1 /*  $Id: unit_test_validator.cpp 636458 2021-08-24 17:53:54Z fukanchi $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Colleen Bollin, NCBI
27 *
28 * File Description:
29 *   Unit tests for the validator.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "unit_test_validator.hpp"
37 
38 #include <corelib/ncbi_system.hpp>
39 
// This macro should be defined before including test_boost.hpp in every
// "*.cpp" file of the executable except one. Just as main() is defined in
// only one *.cpp file of a non-Boost.Test executable, only one file may let
// test_boost.hpp generate it. If NCBI_BOOST_NO_AUTO_TEST_MAIN is not
// defined, test_boost.hpp defines such a "main()" function for the tests.
//
// Usually, if your unit tests consist of a single *.cpp file, you do not
// need to care about this macro at all.
48 //
49 //#define NCBI_BOOST_NO_AUTO_TEST_MAIN
50 
51 #define BAD_VALIDATOR
52 
53 // This header must be included before all Boost.Test headers if there are any
54 #include <corelib/test_boost.hpp>
55 
56 // for ignoring external config files
57 #include <util/util_misc.hpp>
58 
59 #include <objects/biblio/Id_pat.hpp>
60 #include <objects/biblio/Title.hpp>
61 #include <objects/general/Object_id.hpp>
62 #include <objects/general/Dbtag.hpp>
63 #include <objects/general/User_object.hpp>
64 #include <objects/medline/Medline_entry.hpp>
65 #include <objects/misc/sequence_macros.hpp>
66 #include <objects/pub/Pub_equiv.hpp>
67 #include <objects/pub/Pub.hpp>
68 #include <objects/seqset/Seq_entry.hpp>
69 #include <objects/seq/GIBB_mol.hpp>
70 #include <objects/seq/Seq_ext.hpp>
71 #include <objects/seq/Delta_ext.hpp>
72 #include <objects/seq/Delta_seq.hpp>
73 #include <objects/seq/Seq_literal.hpp>
74 #include <objects/seq/Ref_ext.hpp>
75 #include <objects/seq/Map_ext.hpp>
76 #include <objects/seq/Seg_ext.hpp>
77 #include <objects/seq/Seq_gap.hpp>
78 #include <objects/seq/Seq_data.hpp>
79 #include <objects/seq/Seq_descr.hpp>
80 #include <objects/seq/Seqdesc.hpp>
81 #include <objects/seq/MolInfo.hpp>
82 #include <objects/seq/Pubdesc.hpp>
83 #include <objects/seq/Seq_hist.hpp>
84 #include <objects/seq/Seq_hist_rec.hpp>
85 #include <objects/seqalign/Dense_seg.hpp>
86 #include <objects/seqblock/GB_block.hpp>
87 #include <objects/seqblock/EMBL_block.hpp>
88 #include <objects/seqfeat/BioSource.hpp>
89 #include <objects/seqfeat/Org_ref.hpp>
90 #include <objects/seqfeat/OrgName.hpp>
91 #include <objects/seqfeat/SubSource.hpp>
92 #include <objects/seqfeat/Imp_feat.hpp>
93 #include <objects/seqfeat/Cdregion.hpp>
94 #include <objects/seqloc/Seq_id.hpp>
95 #include <objects/seqloc/PDB_seq_id.hpp>
96 #include <objects/seqloc/Giimport_id.hpp>
97 #include <objects/seqloc/Patent_seq_id.hpp>
98 #include <objects/seqloc/Seq_loc.hpp>
99 #include <objects/seqloc/Seq_interval.hpp>
100 #include <objects/macro/Suspect_rule_set.hpp>
101 #include <objects/taxon3/taxon3.hpp>
102 #include <objmgr/object_manager.hpp>
103 #include <objmgr/scope.hpp>
104 #include <objmgr/bioseq_ci.hpp>
105 #include <objmgr/feat_ci.hpp>
106 #include <objmgr/seq_vector.hpp>
107 #include <objmgr/util/sequence.hpp>
108 #include <objmgr/seqdesc_ci.hpp>
109 #include <objmgr/util/sequence.hpp>
110 #include <objects/seq/seqport_util.hpp>
111 #include <objtools/validator/validator.hpp>
112 #include <objtools/validator/validatorp.hpp>
113 #include <objtools/validator/utilities.hpp>
114 #include <objtools/validator/validerror_format.hpp>
115 #include <objtools/validator/translation_problems.hpp>
116 #include <objtools/validator/go_term_validation_and_cleanup.hpp>
117 #include <corelib/ncbiapp.hpp>
118 #include <common/ncbi_export.h>
119 #include <objtools/unit_test_util/unit_test_util.hpp>
120 #include <objtools/edit/struc_comm_field.hpp>
121 #include <objtools/edit/dblink_field.hpp>
122 #include <objtools/edit/cds_fix.hpp>
123 #include <objtools/validator/dup_feats.hpp>
124 
125 // for writing out tmp files
126 #include <serial/objostrasn.hpp>
127 #include <serial/objostrasnb.hpp>
128 
129 extern const char* sc_TestEntryCollidingLocusTags;
130 
131 BEGIN_NCBI_SCOPE
132 BEGIN_SCOPE(objects)
133 
134 using namespace validator;
135 using namespace unit_test_util;
136 
137 
138 
139 
CExpectedError(string accession,EDiagSev severity,string err_code,string err_msg)140 CExpectedError::CExpectedError(string accession, EDiagSev severity, string err_code, string err_msg)
141 : m_Accession (accession), m_Severity (severity), m_ErrCode(err_code), m_ErrMsg(err_msg)
142 {
143 
144 }
145 
146 
~CExpectedError()147 CExpectedError::~CExpectedError()
148 {
149 }
150 
151 
Match(const CValidErrItem & err_item,bool ignore_severity)152 bool CExpectedError::Match(const CValidErrItem& err_item, bool ignore_severity)
153 {
154     if (!NStr::IsBlank(m_Accession) && !NStr::IsBlank(err_item.GetAccnver()) &&
155         !NStr::Equal(err_item.GetAccnver(), m_Accession)) {
156         return false;
157     }
158     if (!NStr::Equal(err_item.GetErrCode(), m_ErrCode)) {
159         return false;
160     }
161     string msg = err_item.GetMsg();
162     size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
163     if (pos != string::npos) {
164         msg = msg.substr(0, pos);
165     }
166 
167     if (!NStr::Equal(msg, m_ErrMsg)) {
168         return false;
169     }
170     if (!ignore_severity && m_Severity != err_item.GetSeverity()) {
171         return false;
172     }
173     return true;
174 }
175 
176 
Test(const CValidErrItem & err_item)177 void CExpectedError::Test(const CValidErrItem& err_item)
178 {
179     if (!NStr::IsBlank (m_Accession) && !NStr::IsBlank (err_item.GetAccnver())) {
180         BOOST_CHECK_EQUAL(err_item.GetAccnver(), m_Accession);
181     }
182     BOOST_CHECK_EQUAL(err_item.GetSeverity(), m_Severity);
183     BOOST_CHECK_EQUAL(err_item.GetErrCode(), m_ErrCode);
184     string msg = err_item.GetMsg();
185     size_t pos = NStr::Find(msg, " EXCEPTION: NCBI C++ Exception:");
186     if (pos != string::npos) {
187         msg = msg.substr(0, pos);
188     }
189     BOOST_CHECK_EQUAL(msg, m_ErrMsg);
190 }
191 
192 
PrintSeenError(const CValidErrItem & err_item)193 void CExpectedError::PrintSeenError(const CValidErrItem& err_item)
194 {
195     string description = err_item.GetAccnver() + ":"
196         + CValidErrItem::ConvertSeverity(err_item.GetSeverity()) + ":"
197         + err_item.GetErrCode() + ":"
198         + err_item.GetMsg();
199     printf("%s\n", description.c_str());
200 
201 }
202 
203 
Print() const204 void CExpectedError::Print() const
205 {
206     string description = m_Accession + ":"
207         + CValidErrItem::ConvertSeverity(m_Severity) + ":"
208         + m_ErrCode + ":"
209         + m_ErrMsg;
210     printf("%s\n", description.c_str());
211 
212 }
213 
214 
215 static bool s_debugMode = false;
216 
WriteErrors(const CValidError & eval,bool debug_mode)217 void WriteErrors(const CValidError& eval, bool debug_mode)
218 {
219     if (debug_mode) {
220         printf ("\n-\n");
221     }
222     for ( CValidError_CI vit(eval); vit; ++vit) {
223         CExpectedError::PrintSeenError(*vit);
224     }
225     if (debug_mode) {
226         printf ("\n\n");
227     }
228         printf ("\n\n");
229 }
230 
231 
CheckErrors(const CValidError & eval,vector<CExpectedError * > & expected_errors)232 void CheckErrors(const CValidError& eval,
233                  vector< CExpectedError* >& expected_errors)
234 {
235     bool   problem_found = false;
236 
237     if (s_debugMode) {
238         WriteErrors (eval, true);
239         return;
240     }
241 
242     vector<bool> expected_found;
243     for (size_t i = 0; i < expected_errors.size(); i++) {
244         if (expected_errors[i]) {
245             expected_found.push_back(false);
246         } else {
247             expected_found.push_back(true);
248         }
249     }
250 
251     for (CValidError_CI vit(eval); vit; ++vit) {
252         bool found = false;
253         for (size_t i = 0; i < expected_errors.size(); i++) {
254             if (!expected_found[i] && expected_errors[i]->Match(*vit)) {
255                 expected_found[i] = true;
256                 found = true;
257                 break;
258             }
259         }
260         if (!found) {
261             for (size_t i = 0; i < expected_errors.size(); i++) {
262                 if (!expected_found[i] && expected_errors[i]->Match(*vit, true)) {
263                     printf("Problem with ");
264                     CExpectedError::PrintSeenError(*vit);
265                     expected_errors[i]->Test(*vit);
266                     expected_found[i] = true;
267                     found = true;
268                     problem_found = true;
269                     break;
270                 }
271             }
272         }
273         if (!found) {
274             BOOST_CHECK_EQUAL("Unexpected error", "Error not found");
275             CExpectedError::PrintSeenError(*vit);
276             problem_found = true;
277         }
278     }
279 
280     for (size_t i = 0; i < expected_errors.size(); i++) {
281         if (!expected_found[i]) {
282             BOOST_CHECK_EQUAL(expected_errors[i]->GetErrMsg(), "Expected error not found");
283             problem_found = true;
284         }
285     }
286 
287     if (problem_found) {
288         WriteErrors (eval, false);
289 
290         printf("Expected:\n");
291         for (auto it : expected_errors) {
292             if (it) {
293                 it->Print();
294             }
295         }
296     }
297 }
298 
299 
CheckStrings(const vector<string> & seen,const vector<string> & expected)300 void CheckStrings(const vector<string>& seen, const vector<string>& expected)
301 {
302     auto it1 = seen.begin();
303     auto it2 = expected.begin();
304     bool any = false;
305     while (it1 != seen.end() && it2 != expected.end()) {
306         BOOST_CHECK_EQUAL(*it1, *it2);
307         if (!NStr::Equal(*it1, *it2)) {
308             any = true;
309         }
310         it1++;
311         it2++;
312     }
313     while (it1 != seen.end()) {
314         BOOST_CHECK_EQUAL(*it1, "Unexpected string");
315         it1++;
316         any = true;
317     }
318     while (it2 != expected.end()) {
319         BOOST_CHECK_EQUAL("Missing string", *it2);
320         it2++;
321         any = true;
322     }
323 
324     if (any) {
325         printf("Seen:\n");
326         auto it1 = seen.begin();
327         while (it1 != seen.end()) {
328             printf("%s\n", (*it1).c_str());
329             it1++;
330         }
331         printf("Expected:\n");
332         auto it2 = expected.begin();
333         while (it2 != expected.end()) {
334             printf("%s\n", (*it2).c_str());
335             it2++;
336         }
337     }
338 }
339 
340 
341 // Not currently used, but I'll leave it here in case
342 // it's useful in the future.
343 
344 //static void SetCountryOnSrc (CBioSource& src, string country)
345 //{
346 //    if (NStr::IsBlank(country)) {
347 //        EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, src) {
348 //            if ((*it)->IsSetSubtype() && (*it)->GetSubtype() == CSubSource::eSubtype_country) {
349 //                ERASE_SUBSOURCE_ON_BIOSOURCE (it, src);
350 //            }
351 //        }
352 //    } else {
353 //        CRef<CSubSource> sub(new CSubSource(CSubSource::eSubtype_country, country));
354 //        src.SetSubtype().push_back(sub);
355 //    }
356 //}
357 
358 
359 
360 END_SCOPE(objects)
361 END_NCBI_SCOPE
362 
363 USING_NCBI_SCOPE;
364 USING_SCOPE(objects);
365 
NCBITEST_INIT_TREE()366 NCBITEST_INIT_TREE()
367 {
368     if ( !CNcbiApplication::Instance()->GetConfig().HasEntry("NCBI", "Data") ) {
369         NCBITEST_DISABLE(Test_Descr_BadStructuredCommentFormat);
370         NCBITEST_DISABLE(Test_Descr_MissingKeyword);
371     }
372 }
373 
374 
SetErrorsAccessions(vector<CExpectedError * > & expected_errors,string accession)375 static void SetErrorsAccessions (vector< CExpectedError *> & expected_errors, string accession)
376 {
377     size_t i, len = expected_errors.size();
378     for (i = 0; i < len; i++) {
379         expected_errors[i]->SetAccession(accession);
380     }
381 }
382 
NCBITEST_INIT_CMDLINE(arg_desc)
{
    // The command-line arguments accepted by this test executable are
    // described here.  Currently that is a single flag.
    const string flag_name = "debug_mode";
    const string flag_help = "Debugging mode writes errors seen for each test";

    arg_desc->AddFlag(flag_name, flag_help);
}
391 
NCBITEST_AUTO_INIT()392 NCBITEST_AUTO_INIT()
393 {
394     // initialization function body
395 
396     const CArgs& args = CNcbiApplication::Instance()->GetArgs();
397     if (args["debug_mode"]) {
398         s_debugMode = true;
399     }
400     g_IgnoreDataFile("institution_codes.txt");
401 }
402 
AddChromosomeNoLocation(vector<CExpectedError * > & expected_errors,const string & id)403 void AddChromosomeNoLocation(vector< CExpectedError *>& expected_errors, const string& id)
404 {
405     expected_errors.push_back(new CExpectedError(id, eDiag_Error, "ChromosomeWithoutLocation",
406         "INDEXER_ONLY - source contains chromosome value '1' but the BioSource location is not set to chromosome"));
407 }
408 
AddChromosomeNoLocation(vector<CExpectedError * > & expected_errors,CRef<CSeq_entry> entry)409 void AddChromosomeNoLocation(vector< CExpectedError *>& expected_errors, CRef<CSeq_entry> entry)
410 {
411     if (entry->IsSeq()) {
412         CConstRef<CSeq_id> seqid = sequence::GetId(entry->GetSeq(), sequence::eGetId_Best).GetSeqId();
413         AddChromosomeNoLocation(expected_errors, seqid->AsFastaString());
414     } else if (entry->IsSet()) {
415         if (entry->GetSet().GetClass() == CBioseq_set::eClass_nuc_prot) {
416             CRef<CSeq_entry> nuc_entry = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
417             AddChromosomeNoLocation(expected_errors, nuc_entry);
418         } else {
419             for (auto it : entry->SetSet().SetSeq_set()) {
420                 AddChromosomeNoLocation(expected_errors, it);
421             }
422         }
423     }
424 }
425 
426 
427 // new case test ground
BOOST_AUTO_TEST_CASE(Test_Descr_LatLonValue)
{
    // The entry starts out as a good sequence with country "USA" and a
    // lat_lon in the southern/western hemispheres.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "USA");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "35 S 80 W");

    STANDARD_SETUP

    // Every later change of a subsource is made as two calls: first with an
    // empty value, then with the new one (presumably the empty value
    // removes the previous subsource of that type - confirm in
    // unit_test_util::SetSubSource).
    auto replace_subsource = [&entry](CSubSource::ESubtype subtype, const string& value) {
        unit_test_util::SetSubSource(entry, subtype, "");
        unit_test_util::SetSubSource(entry, subtype, value);
    };

    /*
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
                              "Latitude should be set to N (northern hemisphere)"));
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    */

    // USA with an eastern longitude.
    replace_subsource(CSubSource::eSubtype_lat_lon, "35 N 80 E");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
                              "Longitude should be set to W (western hemisphere)"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Madagascar with a northern latitude; the same expectation is reused
    // with a different message.
    replace_subsource(CSubSource::eSubtype_country, "Madagascar");
    replace_subsource(CSubSource::eSubtype_lat_lon, "25 N 47 E");
    expected_errors[0]->SetErrMsg("Latitude should be set to S (southern hemisphere)");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    /*
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "25 S 47 W");
    expected_errors[0]->SetErrMsg("Longitude should be set to E (eastern hemisphere)");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    */

    CLEAR_ERRORS

    // Austria with latitude and longitude swapped.
    replace_subsource(CSubSource::eSubtype_lat_lon, "15 N 47 E");
    replace_subsource(CSubSource::eSubtype_country, "Austria");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonValue",
                              "Latitude and longitude values appear to be exchanged"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
480 
481 
TestOneLatLonCountry(const string & country,const string & lat_lon,const string & error,bool use_state=false,const string & err_code="LatLonCountry")482 void TestOneLatLonCountry(const string& country, const string& lat_lon, const string& error, bool use_state = false, const string& err_code = "LatLonCountry")
483 {
484     // prepare entry
485     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
486     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, country);
487     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, lat_lon);
488 
489     STANDARD_SETUP
490 
491     if (use_state) {
492         options |= CValidator::eVal_latlon_check_state;
493     }
494 
495     if (!error.empty()) {
496         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, err_code, error));
497     }
498     eval = validator.Validate(seh, options);
499     CheckErrors(*eval, expected_errors);
500 
501     if (!error.empty()) {
502         CValidErrorFormat format(*objmgr);
503         vector<string> expected;
504         expected.push_back("LatLonCountry Errors");
505         expected.push_back("lcl|good:" + error);
506         expected.push_back("");
507 
508         vector<string> seen;
509         vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
510         ITERATE(vector<string>, it, cat_list) {
511             vector<string> sublist;
512             NStr::Split(*it, "\n", sublist, 0);
513             ITERATE(vector<string>, sit, sublist) {
514                 seen.push_back(*sit);
515             }
516         }
517 
518         CheckStrings(seen, expected);
519     }
520 
521     CLEAR_ERRORS
522 }
523 
524 
BOOST_AUTO_TEST_CASE(Test_VR_840)
{
    // With the state-level check switched on, this Portuguese position is
    // expected to validate without any error (empty expected message).
    const string no_error;
    const bool   use_state = true;

    TestOneLatLonCountry("Portugal", "37.7715 N 25.3097 W", no_error, use_state);
}
529 
530 
531 
BOOST_AUTO_TEST_CASE(Test_Descr_LatLonCountry)
{
    // Each case claims "Romania" for a position that lies elsewhere; the
    // state-level check stays off for all of them.
    struct SCase {
        const char* lat_lon;   // value of the lat_lon subsource
        const char* error;     // expected error message
        const char* err_code;  // expected error code
    };
    static const SCase kCases[] = {
        { "46.5 N 20 E",
          "Lat_lon '46.5 N 20 E' maps to 'Hungary' instead of 'Romania' - claimed region 'Romania' is at distance 45 km",
          "LatLonCountry" },
        { "34 N 65 E",
          "Lat_lon '34 N 65 E' maps to 'Afghanistan' instead of 'Romania'",
          "LatLonCountry" },
        { "48 N 15 E",
          "Lat_lon '48 N 15 E' maps to 'Austria' instead of 'Romania'",
          "LatLonCountry" },
        { "48 N 15 W",
          "Lat_lon '48 N 15 W' is in water 'Atlantic Ocean'",
          "LatLonWater" },
    };

    for (const SCase& one_case : kCases) {
        TestOneLatLonCountry("Romania", one_case.lat_lon, one_case.error, false, one_case.err_code);
    }

    // RW-1137 this had inconsistent behavior in production vs. development tests, possibly due to version skew in
    // Puerto Rico cleanup code, so commenting out to avoid spurious error reports
    /*
    TestOneLatLonCountry("Puerto Rico: Rio Mameyes in Luquillo", "18.47 N 64.23000000000002 W",
        "Lat_lon '18.47 N 64.23000000000002 W' is in water 'Caribbean Sea', 'Puerto Rico: Rio Mameyes in Luquillo' is 108 km away",
        false, "LatLonWater");
    */
}
548 
549 
BOOST_AUTO_TEST_CASE(Test_ValidError_Format)550 BOOST_AUTO_TEST_CASE(Test_ValidError_Format)
551 {
552     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
553 
554     // Create consensus splice problems
555     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
556     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
557     cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
558     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
559     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
560     CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
561     unit_test_util::AddFeat(intron, nuc);
562 
563     CRef<CSeq_feat> other_intron = unit_test_util::AddMiscFeature(nuc);
564     other_intron->SetData().SetImp().SetKey("intron");
565     CRef<CSeq_feat> gene = MakeGeneForFeature(cds);
566     gene->SetData().SetGene().SetLocus_tag("fake_locustag");
567     AddFeat(gene, nuc);
568 
569     // create EC number problems
570     CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
571     prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
572     prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
573     prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
574     prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
575     prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
576     prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
577     prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
578 
579 
580     // create bad institution code errors
581     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "XXX:foo");
582     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "YYY:foo");
583     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "ZZZ:foo");
584 
585     // create lat-lon country error
586     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "30 N 30 E");
587     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "Panama");
588 
589     STANDARD_SETUP
590 
591     eval = validator.Validate(seh, options);
592 
593     CValidErrorFormat format(*objmgr);
594 
595     vector<string> expected;
596     expected.push_back("intron\tlcl|nuc\tGT at 17");
597     expected.push_back("intron\tlcl|nuc\tGT at 1");
598     expected.push_back("intron\tlcl|nuc\tAG at 11");
599     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
600     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
601     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
602     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
603     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
604     expected.push_back("CDS\tlcl|nuc\tGT at 16");
605     expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
606     expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
607     expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
608     expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
609 
610     vector<string> seen;
611     for (CValidError_CI vit(*eval); vit; ++vit) {
612         string val = format.FormatForSubmitterReport(*vit, scope);
613         seen.push_back(val);
614     }
615     CheckStrings(seen, expected);
616 
617     expected.clear();
618     seen.clear();
619     for (CValidError_CI vit(*eval); vit; ++vit) {
620         seen.push_back(vit->GetErrCode());
621     }
622     expected.push_back("NotSpliceConsensusDonor");
623     expected.push_back("NotSpliceConsensusDonorTerminalIntron");
624     expected.push_back("NotSpliceConsensusAcceptor");
625     expected.push_back("DeletedEcNumber");
626     expected.push_back("ReplacedEcNumber");
627     expected.push_back("BadEcNumberValue");
628     expected.push_back("BadEcNumberFormat");
629     expected.push_back("BadEcNumberValue");
630     expected.push_back("NotSpliceConsensusDonor");
631     expected.push_back("LatLonCountry");
632     expected.push_back("BadInstitutionCode");
633     expected.push_back("BadInstitutionCode");
634     expected.push_back("BadInstitutionCode");
635     CheckStrings(seen, expected);
636 
637     seen.clear();
638     expected.clear();
639     vector<unsigned int> codes = format.GetListOfErrorCodes(*eval);
640     ITERATE(vector<unsigned int>, it, codes) {
641         string val = CValidErrItem::ConvertErrCode(*it);
642         seen.push_back(val);
643     }
644     expected.push_back("LatLonCountry");
645     expected.push_back("BadInstitutionCode");
646     expected.push_back("BadEcNumberFormat");
647     expected.push_back("BadEcNumberValue");
648     expected.push_back("NotSpliceConsensusDonor");
649     expected.push_back("NotSpliceConsensusAcceptor");
650     expected.push_back("DeletedEcNumber");
651     expected.push_back("ReplacedEcNumber");
652     expected.push_back("NotSpliceConsensusDonorTerminalIntron");
653     CheckStrings(seen, expected);
654 
655     string rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_NotSpliceConsensusDonor);
656     expected.clear();
657     seen.clear();
658     NStr::Split(rval, "\n", seen, 0);
659     expected.push_back("Not Splice Consensus");
660     expected.push_back("intron\tlcl|nuc\tGT at 17");
661     expected.push_back("CDS\tlcl|nuc\tGT at 16");
662     expected.push_back("");
663     CheckStrings(seen, expected);
664 
665     rval = format.FormatCategoryForSubmitterReport(*eval, scope, eSubmitterFormatErrorGroup_ConsensusSplice);
666     expected.clear();
667     seen.clear();
668     NStr::Split(rval, "\n", seen, 0);
669     expected.push_back("Not Splice Consensus");
670     expected.push_back("intron\tlcl|nuc\tGT at 17");
671     expected.push_back("intron\tlcl|nuc\tGT at 1");
672     expected.push_back("intron\tlcl|nuc\tAG at 11");
673     expected.push_back("CDS\tlcl|nuc\tGT at 16");
674     expected.push_back("");
675     CheckStrings(seen, expected);
676 
677     expected.clear();
678     seen.clear();
679     vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
680     ITERATE(vector<string>, it, cat_list) {
681         vector<string> sublist;
682         NStr::Split(*it, "\n", sublist, 0);
683         ITERATE(vector<string>, sit, sublist) {
684             seen.push_back(*sit);
685         }
686     }
687     expected.push_back("Not Splice Consensus");
688     expected.push_back("intron\tlcl|nuc\tGT at 17");
689     expected.push_back("intron\tlcl|nuc\tGT at 1");
690     expected.push_back("intron\tlcl|nuc\tAG at 11");
691     expected.push_back("CDS\tlcl|nuc\tGT at 16");
692     expected.push_back("");
693     expected.push_back("EC Number Format");
694     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
695     expected.push_back("");
696     expected.push_back("EC Number Value");
697     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
698     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
699     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
700     expected.push_back("lcl|prot\t1.2.3.10;1.1.3.22;1.1.99.n;1.1.1.17;11.22.33.44;11.22.n33.44;11.22.33.n44\t\tfake protein name");
701     expected.push_back("");
702     expected.push_back("Bad Institution Codes");
703     expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
704     expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
705     expected.push_back("lcl|nuc\tXXX;YYY;ZZZ");
706     expected.push_back("");
707     expected.push_back("LatLonCountry Errors");
708     expected.push_back("lcl|nuc:Lat_lon '30 N 30 E' maps to 'Egypt' instead of 'Panama'");
709     expected.push_back("");
710     CheckStrings(seen, expected);
711 
712 }
713 
714 
BOOST_AUTO_TEST_CASE(Test_GB_6395)
{
    // A good sequence whose taxon is set to 0; the complete submitter
    // report is expected to hold exactly one "NoTaxonID" section.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetTaxon(entry, 0);

    STANDARD_SETUP

    eval = validator.Validate(seh, options);

    CValidErrorFormat format(*objmgr);

    // Flatten the per-category report texts into individual lines.
    vector<string> seen;
    vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
    for (const string& category : cat_list) {
        vector<string> sublist;
        NStr::Split(category, "\n", sublist, 0);
        seen.insert(seen.end(), sublist.begin(), sublist.end());
    }

    const vector<string> expected = {
        "NoTaxonID",
        "lcl|good:Sebaea microphylla",
        ""
    };

    CheckStrings(seen, expected);
}
743 
744 
BOOST_AUTO_TEST_CASE(Test_Descr_LatLonState)
{
    // A good sequence that claims South Carolina; with the state-level
    // check enabled, one Info-level "LatLonState" error is expected.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "USA: South Carolina");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "36 N 80 W");

    STANDARD_SETUP

    options |= CValidator::eVal_latlon_check_state;

    const string state_mismatch =
        "Lat_lon '36 N 80 W' maps to 'USA: North Carolina' instead of 'USA: South Carolina' - claimed region 'USA: South Carolina' is at distance 130 km";
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "LatLonState", state_mismatch));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
763 
764 
s_BuildBadEcNumberEntry()765 CRef<CSeq_entry> s_BuildBadEcNumberEntry()
766 {
767     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet ();
768     CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();
769     prot->SetData().SetProt().SetEc().push_back("1.2.3.10");
770     prot->SetData().SetProt().SetEc().push_back("1.1.3.22");
771     prot->SetData().SetProt().SetEc().push_back("1.1.99.n");
772     prot->SetData().SetProt().SetEc().push_back("1.1.1.17");
773     prot->SetData().SetProt().SetEc().push_back("11.22.33.44");
774     prot->SetData().SetProt().SetEc().push_back("11.22.n33.44");
775     prot->SetData().SetProt().SetEc().push_back("11.22.33.n44");
776     return entry;
777 }
778 
779 
// Checks the EC_number diagnostics twice: first with the EC numbers stored on
// the protein feature, then with the same values supplied as EC_number
// qualifiers on an "exon" feature added to the first member of the set.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadEcNumberValue)
{
    // One row per diagnostic expected from the protein-feature pass.
    struct SExpected {
        EDiagSev    severity;
        const char* code;
        const char* message;
    };
    static const SExpected kProtErrors[] = {
        { eDiag_Warning, "DeletedEcNumber",
          "EC_number 1.2.3.10 was deleted" },
        { eDiag_Warning, "ReplacedEcNumber",
          "EC_number 1.1.3.22 was transferred and is no longer valid" },
        { eDiag_Warning, "BadEcNumberValue",
          "11.22.33.44 is not a legal value for qualifier EC_number" },
        { eDiag_Warning, "BadEcNumberFormat",
          "11.22.n33.44 is not in proper EC_number format" },
        { eDiag_Info,    "BadEcNumberValue",
          "11.22.33.n44 is not a legal preliminary value for qualifier EC_number" }
    };
    // Qualifier values used for the second pass, in insertion order.
    static const char* const kEcQualValues[] = {
        "1.2.3.10",
        "1.1.3.22",
        "1.1.99.n",
        "1.1.1.17",
        "11.22.33.44",
        "11.22.n33.44",
        "11.22.33.n44"
    };

    CRef<CSeq_entry> entry = s_BuildBadEcNumberEntry();
    CRef<CSeq_feat> prot = entry->GetSet().GetSeq_set().back()->GetAnnot().front()->GetData().GetFtable().front();

    STANDARD_SETUP

    // Pass 1: EC numbers live on the protein feature.
    for (size_t i = 0; i < sizeof(kProtErrors) / sizeof(kProtErrors[0]); ++i) {
        expected_errors.push_back(new CExpectedError("lcl|prot",
                                                     kProtErrors[i].severity,
                                                     kProtErrors[i].code,
                                                     kProtErrors[i].message));
    }
    //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Pass 2: clear the protein's EC list and carry the same values as
    // EC_number qualifiers on an exon feature instead.
    scope.RemoveTopLevelSeqEntry(seh);
    prot->SetData().SetProt().ResetEc();
    CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(entry->GetSet().GetSeq_set().front());
    misc->SetData().SetImp().SetKey("exon");
    for (size_t i = 0; i < sizeof(kEcQualValues) / sizeof(kEcQualValues[0]); ++i) {
        misc->AddQualifier("EC_number", kEcQualValues[i]);
    }

    // Same diagnostics, now reported against lcl|nuc, with a different
    // wording for the second one.
    SetErrorsAccessions(expected_errors, "lcl|nuc");
    expected_errors[1]->SetErrMsg("EC_number 1.1.3.22 was replaced");

    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
820 
821 
// Exercises two rpt_unit_seq diagnostics on a repeat_region feature:
// a unit that does not match the sequence, and a unit longer than the feature.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidQualifierValue)
{
    // Case 1: rpt_unit_seq "ATA" is expected to trigger RepeatSeqDoNotMatch.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(entry);
    misc->SetData().SetImp().SetKey("repeat_region");
    misc->AddQualifier("rpt_unit_seq", "ATA");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "RepeatSeqDoNotMatch",
                           "repeat_region /rpt_unit and underlying sequence do not match"));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Case 2: start over with a fresh entry and a longer repeat unit.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    misc = unit_test_util::AddMiscFeature(entry);
    misc->SetData().SetImp().SetKey("repeat_region");
    misc->AddQualifier("rpt_unit_seq", "ATAGTGATAGTG");
    seh = scope.AddTopLevelSeqEntry(*entry);

    // Re-purpose the single expected error for the new diagnostic.
    CExpectedError& exp_err = *expected_errors[0];
    exp_err.SetSeverity(eDiag_Info);
    exp_err.SetErrCode("InvalidRepeatUnitLength");
    exp_err.SetErrMsg("Length of rpt_unit_seq is greater than feature length");

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
851 
852 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ExtNotAllowed)853 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ExtNotAllowed)
854 {
855     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
856 
857     STANDARD_SETUP
858 
859     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ExtNotAllowed", "Bioseq-ext not allowed on virtual Bioseq"));
860     //AddChromosomeNoLocation(expected_errors, "lcl|good");
861 
862     // repr = virtual
863     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_virtual);
864     entry->SetSeq().SetInst().ResetSeq_data();
865     entry->SetSeq().SetInst().SetExt().SetDelta();
866     eval = validator.Validate(seh, options);
867     CheckErrors (*eval, expected_errors);
868 
869     // repr = raw
870     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_raw);
871     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
872     expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on raw Bioseq");
873     eval = validator.Validate(seh, options);
874     CheckErrors (*eval, expected_errors);
875 
876     entry->SetSeq().SetInst().ResetExt();
877     entry->SetSeq().SetInst().ResetSeq_data();
878     expected_errors[0]->SetErrCode("SeqDataNotFound");
879     expected_errors[0]->SetErrMsg("Missing Seq-data on raw Bioseq");
880     expected_errors[0]->SetSeverity(eDiag_Critical);
881     eval = validator.Validate(seh, options);
882     CheckErrors (*eval, expected_errors);
883 
884     entry->SetSeq().SetInst().SetSeq_data().SetGap();
885     eval = validator.Validate(seh, options);
886     CheckErrors (*eval, expected_errors);
887 
888     // repr = const
889     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_const);
890     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
891     entry->SetSeq().SetInst().SetExt().SetDelta();
892     expected_errors[0]->SetErrCode("ExtNotAllowed");
893     expected_errors[0]->SetErrMsg("Bioseq-ext not allowed on constructed Bioseq");
894     eval = validator.Validate(seh, options);
895     CheckErrors (*eval, expected_errors);
896 
897     entry->SetSeq().SetInst().ResetExt();
898     entry->SetSeq().SetInst().ResetSeq_data();
899     expected_errors[0]->SetErrCode("SeqDataNotFound");
900     expected_errors[0]->SetErrMsg("Missing Seq-data on constructed Bioseq");
901     expected_errors[0]->SetSeverity(eDiag_Critical);
902     eval = validator.Validate(seh, options);
903     CheckErrors (*eval, expected_errors);
904 
905     entry->SetSeq().SetInst().SetSeq_data().SetGap();
906     eval = validator.Validate(seh, options);
907     CheckErrors (*eval, expected_errors);
908 
909     // repr = map
910     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_map);
911     entry->SetSeq().SetInst().ResetSeq_data();
912     expected_errors[0]->SetErrCode("ExtBadOrMissing");
913     expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on map Bioseq");
914     expected_errors[0]->SetSeverity(eDiag_Error);
915     eval = validator.Validate(seh, options);
916     CheckErrors (*eval, expected_errors);
917 
918     entry->SetSeq().SetInst().SetExt().SetDelta();
919     eval = validator.Validate(seh, options);
920     CheckErrors (*eval, expected_errors);
921 
922     entry->SetSeq().SetInst().SetExt().SetRef();
923     eval = validator.Validate(seh, options);
924     CheckErrors (*eval, expected_errors);
925 
926     entry->SetSeq().SetInst().SetExt().SetMap();
927     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
928     expected_errors[0]->SetErrCode("SeqDataNotAllowed");
929     expected_errors[0]->SetErrMsg("Seq-data not allowed on map Bioseq");
930     eval = validator.Validate(seh, options);
931     CheckErrors (*eval, expected_errors);
932 
933 
934     // repr = ref
935     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_ref);
936     entry->SetSeq().SetInst().ResetExt();
937     entry->SetSeq().SetInst().ResetSeq_data();
938     expected_errors[0]->SetErrCode("ExtBadOrMissing");
939     expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on reference Bioseq");
940     eval = validator.Validate(seh, options);
941     CheckErrors (*eval, expected_errors);
942 
943     // repr = seg
944     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
945     expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on seg Bioseq");
946     eval = validator.Validate(seh, options);
947     CheckErrors (*eval, expected_errors);
948 
949     // repr = consen
950     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_consen);
951     expected_errors[0]->SetSeverity(eDiag_Critical);
952     expected_errors[0]->SetErrCode("ReprInvalid");
953     expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 6");
954     eval = validator.Validate(seh, options);
955     CheckErrors (*eval, expected_errors);
956 
957     // repr = notset
958     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_not_set);
959     expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 0");
960     eval = validator.Validate(seh, options);
961     CheckErrors (*eval, expected_errors);
962 
963     // repr = other
964     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_other);
965     expected_errors[0]->SetErrMsg("Invalid Bioseq->repr = 255");
966     eval = validator.Validate(seh, options);
967     CheckErrors (*eval, expected_errors);
968 
969     // repr = delta
970     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
971     entry->SetSeq().SetInst().SetExt().SetDelta();
972     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGG");
973     expected_errors[0]->SetSeverity(eDiag_Error);
974     expected_errors[0]->SetErrCode("SeqDataNotAllowed");
975     expected_errors[0]->SetErrMsg("Seq-data not allowed on delta Bioseq");
976     eval = validator.Validate(seh, options);
977     CheckErrors (*eval, expected_errors);
978 
979     entry->SetSeq().SetInst().ResetExt();
980     entry->SetSeq().SetInst().ResetSeq_data();
981     expected_errors[0]->SetSeverity(eDiag_Error);
982     expected_errors[0]->SetErrCode("ExtBadOrMissing");
983     expected_errors[0]->SetErrMsg("Missing or incorrect Bioseq-ext on delta Bioseq");
984     eval = validator.Validate(seh, options);
985     CheckErrors (*eval, expected_errors);
986 
987     CLEAR_ERRORS
988 }
989 
990 
// For each repr value in the table below, a Critical ReprInvalid error
// carrying the paired message is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ReprInvalid)
{
    // repr value paired with the message expected for it.
    struct SReprCase {
        CSeq_inst::TRepr repr;
        const char*      message;
    };
    static const SReprCase kCases[] = {
        { CSeq_inst::eRepr_not_set, "Invalid Bioseq->repr = 0"   },
        { CSeq_inst::eRepr_other,   "Invalid Bioseq->repr = 255" },
        { CSeq_inst::eRepr_consen,  "Invalid Bioseq->repr = 6"   }
    };

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "ReprInvalid",
                                                 "Invalid Bioseq->repr = 0"));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
        expected_errors[0]->SetErrMsg(kCases[i].message);
        entry->SetSeq().SetInst().SetRepr(kCases[i].repr);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS
}
1015 
1016 
// Parses the ASN.1 text in sc_TestEntryCollidingLocusTags, validates it, and
// compares the result against the fixed list of diagnostics tabulated below.
BOOST_AUTO_TEST_CASE(Test_CollidingLocusTags)
{
    // Every expected diagnostic is reported against this accession.
    const string kAccession = "lcl|LocusCollidesWithLocusTag";

    // Expected diagnostics, in the order they are handed to CheckErrors.
    struct SExpected {
        EDiagSev    severity;
        const char* code;
        const char* message;
    };
    static const SExpected kExpected[] = {
        { eDiag_Warning, "TerminalNs",                    "N at end of sequence" },
        { eDiag_Warning, "GeneLocusCollidesWithLocusTag", "locus collides with locus_tag in another gene" },
        { eDiag_Error,   "CollidingLocusTags",            "Colliding locus_tags in gene features" },
        { eDiag_Error,   "CollidingLocusTags",            "Colliding locus_tags in gene features" },
        { eDiag_Error,   "NoMolInfoFound",                "No Mol-info applies to this Bioseq" },
        { eDiag_Error,   "LocusTagGeneLocusMatch",        "Gene locus and locus_tag 'foo' match" },
        { eDiag_Error,   "NoPubFound",                    "No publications anywhere on this entire record." },
        { eDiag_Info,    "MissingPubRequirement",         "No submission citation anywhere on this entire record." },
        { eDiag_Error,   "NoSourceDescriptor",            "No source information included on this record." }
    };

    // Deserialize the test entry from its ASN.1 text form.
    CRef<CSeq_entry> entry(new CSeq_entry());
    {{
         CNcbiIstrstream istr(sc_TestEntryCollidingLocusTags);
         istr >> MSerial_AsnText >> *entry;
     }}

    CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
    CScope scope(*objmgr);
    CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);

    CValidator validator(*objmgr);

    // Validator options used for this run.
    unsigned int options = CValidator::eVal_need_isojta
                         | CValidator::eVal_far_fetch_mrna_products
                         | CValidator::eVal_validate_id_set
                         | CValidator::eVal_indexer_version
                         | CValidator::eVal_use_entrez;

    vector< CExpectedError *> expected_errors;
    for (size_t i = 0; i < sizeof(kExpected) / sizeof(kExpected[0]); ++i) {
        expected_errors.push_back(new CExpectedError(kAccession,
                                                     kExpected[i].severity,
                                                     kExpected[i].code,
                                                     kExpected[i].message));
    }

    CConstRef<CValidError> eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
1054 
1055 
// ASN.1 text for the Seq-entry consumed by Test_CollidingLocusTags.
// It describes one raw DNA Bioseq (local id "LocusCollidesWithLocusTag",
// length 24, sequence ending in "NN") with a feature table of four gene
// features whose locus / locus-tag pairs are:
//   foo / foo,  bar / foo,  bar / baz,  quux / baz
// Every source line of the literal ends with a backslash continuation, so the
// resulting string contains no newline characters; the leading spaces of each
// continued line are part of the string.
const char* sc_TestEntryCollidingLocusTags ="Seq-entry ::= seq {\
    id {\
      local str \"LocusCollidesWithLocusTag\" } ,\
    inst {\
      repr raw ,\
      mol dna ,\
      length 24 ,\
      seq-data\
        iupacna \"AATTGGCCAANNAATTGGCCAANN\" } ,\
    annot {\
      {\
        data\
          ftable {\
            {\
              data\
                gene {\
                  locus \"foo\" ,\
                  locus-tag \"foo\" } ,\
              location\
                int {\
                  from 0 ,\
                  to 4 ,\
                  strand plus ,\
                  id\
                    local str \"LocusCollidesWithLocusTag\" } } ,\
            {\
              data\
                gene {\
                  locus \"bar\" ,\
                  locus-tag \"foo\" } ,\
              location\
                int {\
                  from 5 ,\
                  to 9 ,\
                  strand plus ,\
                  id\
                    local str \"LocusCollidesWithLocusTag\" } } ,\
            {\
              data\
                gene {\
                  locus \"bar\" ,\
                  locus-tag \"baz\" } ,\
              location\
                int {\
                  from 10 ,\
                  to 14 ,\
                  strand plus ,\
                  id\
                    local str \"LocusCollidesWithLocusTag\" } } ,\
            {\
              data\
                gene {\
                  locus \"quux\" ,\
                  locus-tag \"baz\" } ,\
              location\
                int {\
                  from 15 ,\
                  to 19 ,\
                  strand plus ,\
                  id\
                    local str \"LocusCollidesWithLocusTag\" } } } } } }\
";
1118 
1119 
// A protein with circular, tandem or "other" topology is expected to yield a
// CircularProtein error; not-set and linear topologies are expected to be clean.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_CircularProtein)
{
    // Topologies expected to be rejected on a protein.
    static const CSeq_inst::TTopology kRejected[] = {
        CSeq_inst::eTopology_circular,
        CSeq_inst::eTopology_tandem,
        CSeq_inst::eTopology_other
    };
    // Topologies expected to produce no error.
    static const CSeq_inst::TTopology kAccepted[] = {
        CSeq_inst::eTopology_not_set,
        CSeq_inst::eTopology_linear
    };

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "CircularProtein",
                                                 "Non-linear topology set on protein"));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    unit_test_util::SetCompleteness(entry, CMolInfo::eCompleteness_complete);

    for (size_t i = 0; i < sizeof(kRejected) / sizeof(kRejected[0]); ++i) {
        entry->SetSeq().SetInst().SetTopology(kRejected[i]);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    // From here on the expected-error list is empty.
    CLEAR_ERRORS

    //AddChromosomeNoLocation(expected_errors, "lcl|good");
    for (size_t i = 0; i < sizeof(kAccepted) / sizeof(kAccepted[0]); ++i) {
        entry->SetSeq().SetInst().SetTopology(kAccepted[i]);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS
}
1157 
1158 
// A protein marked double-stranded, mixed or "other" is expected to yield a
// BadProteinMoltype error; not-set and single-stranded are expected to be clean.
BOOST_AUTO_TEST_CASE(Test_BadProteinMoltype)
{
    // Strand values expected to be rejected on a protein.
    static const CSeq_inst::TStrand kRejected[] = {
        CSeq_inst::eStrand_ds,
        CSeq_inst::eStrand_mixed,
        CSeq_inst::eStrand_other
    };
    // Strand values expected to produce no error.
    static const CSeq_inst::TStrand kAccepted[] = {
        CSeq_inst::eStrand_not_set,
        CSeq_inst::eStrand_ss
    };

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadProteinMoltype",
                                                 "Protein not single stranded"));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    for (size_t i = 0; i < sizeof(kRejected) / sizeof(kRejected[0]); ++i) {
        entry->SetSeq().SetInst().SetStrand(kRejected[i]);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    // From here on the expected-error list is empty.
    CLEAR_ERRORS

    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    for (size_t i = 0; i < sizeof(kAccepted) / sizeof(kAccepted[0]); ++i) {
        entry->SetSeq().SetInst().SetStrand(kAccepted[i]);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS
}
1195 
1196 
// For each Seq-inst mol value in the table below, an Error with the paired
// code and message is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNotSet)
{
    // mol value paired with the error code and message expected for it.
    struct SMolCase {
        CSeq_inst::TMol mol;
        const char*     code;
        const char*     message;
    };
    static const SMolCase kCases[] = {
        { CSeq_inst::eMol_not_set, "MolNotSet",   "Bioseq.mol is 0" },
        { CSeq_inst::eMol_other,   "MolOther",    "Bioseq.mol is type other" },
        { CSeq_inst::eMol_na,      "MolNuclAcid", "Bioseq.mol is type nucleic acid" }
    };

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNotSet",
                                                 "Bioseq.mol is 0"));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
        expected_errors[0]->SetErrCode(kCases[i].code);
        expected_errors[0]->SetErrMsg(kCases[i].message);
        entry->SetSeq().SetInst().SetMol(kCases[i].mol);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS

}
1225 
1226 
// Setting a fuzz on the Seq-inst is expected to produce FuzzyLen for raw and
// const reprs; once the Seq-data is a gap, SeqDataNotFound is expected instead.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_FuzzyLen)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "FuzzyLen",
                                                 "Fuzzy length on raw Bioseq"));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    // Shorthands for the objects mutated below.
    CSeq_inst&      seq_inst = entry->SetSeq().SetInst();
    CExpectedError& exp_err  = *expected_errors[0];

    // Raw Bioseq with a fuzz.
    seq_inst.SetFuzz();
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Same fuzz on a const Bioseq.
    seq_inst.SetRepr(CSeq_inst::eRepr_const);
    exp_err.SetErrMsg("Fuzzy length on const Bioseq");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // With a gap as Seq-data the fuzzy-length error is no longer expected.
    seq_inst.SetSeq_data().SetGap();
    exp_err.SetSeverity(eDiag_Critical);
    exp_err.SetErrCode("SeqDataNotFound");
    exp_err.SetErrMsg("Missing Seq-data on constructed Bioseq");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
1255 
1256 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidAlphabet)1257 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidAlphabet)
1258 {
1259     CRef<CSeq_entry> prot_entry = unit_test_util::BuildGoodProtSeq();
1260 
1261     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
1262     CScope scope(*objmgr);
1263     scope.AddDefaults();
1264     CSeq_entry_Handle prot_seh = scope.AddTopLevelSeqEntry(*prot_entry);
1265 
1266     CValidator validator(*objmgr);
1267 
1268     // Set validator options
1269     unsigned int options = CValidator::eVal_need_isojta
1270                           | CValidator::eVal_far_fetch_mrna_products
1271                           | CValidator::eVal_validate_id_set | CValidator::eVal_indexer_version
1272                           | CValidator::eVal_use_entrez;
1273 
1274     // list of expected errors
1275     vector< CExpectedError *> expected_errors;
1276     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidAlphabet", "Using a nucleic acid alphabet on a protein sequence"));
1277     //AddChromosomeNoLocation(expected_errors, "lcl|good");
1278     prot_entry->SetSeq().SetInst().SetSeq_data().SetIupacna();
1279     CConstRef<CValidError> eval = validator.Validate(prot_seh, options);
1280     CheckErrors (*eval, expected_errors);
1281 
1282     prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na();
1283     eval = validator.Validate(prot_seh, options);
1284     CheckErrors (*eval, expected_errors);
1285 
1286     prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na();
1287     eval = validator.Validate(prot_seh, options);
1288     CheckErrors (*eval, expected_errors);
1289 
1290     prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbi8na();
1291     eval = validator.Validate(prot_seh, options);
1292     CheckErrors (*eval, expected_errors);
1293 
1294     prot_entry->SetSeq().SetInst().SetSeq_data().SetNcbipna();
1295     eval = validator.Validate(prot_seh, options);
1296     CheckErrors (*eval, expected_errors);
1297 
1298     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
1299     CScope scope2(*objmgr);
1300     scope2.AddDefaults();
1301     CSeq_entry_Handle seh = scope2.AddTopLevelSeqEntry(*entry);
1302 
1303     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa();
1304     expected_errors[0]->SetErrMsg("Using a protein alphabet on a nucleic acid");
1305 
1306     eval = validator.Validate(seh, options);
1307     CheckErrors (*eval, expected_errors);
1308 
1309     entry->SetSeq().SetInst().SetSeq_data().SetNcbi8aa();
1310     eval = validator.Validate(seh, options);
1311     CheckErrors (*eval, expected_errors);
1312 
1313     entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa();
1314     eval = validator.Validate(seh, options);
1315     CheckErrors (*eval, expected_errors);
1316 
1317     entry->SetSeq().SetInst().SetSeq_data().SetNcbipaa();
1318     eval = validator.Validate(seh, options);
1319     CheckErrors (*eval, expected_errors);
1320 
1321     entry->SetSeq().SetInst().SetSeq_data().SetNcbistdaa();
1322     eval = validator.Validate(seh, options);
1323     CheckErrors (*eval, expected_errors);
1324 
1325     CLEAR_ERRORS
1326 }
1327 
1328 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidResidue)1329 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_InvalidResidue)
1330 {
1331     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
1332 
1333     STANDARD_SETUP
1334 
1335     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1336     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1337     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1338     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFB');
1339     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1340     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1341     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFC');
1342     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1343     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1344     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFD');
1345     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1346     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFE');
1347     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1348     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('\xFF');
1349     entry->SetSeq().SetInst().SetLength(65);
1350     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [5]"));
1351     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [6]"));
1352     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [9]"));
1353     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [10]"));
1354     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [12]"));
1355     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [15]"));
1356     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [16]"));
1357     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [17]"));
1358     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [21]"));
1359     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [24]"));
1360     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [26]"));
1361     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'E' at position [31]"));
1362     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'F' at position [32]"));
1363     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'I' at position [35]"));
1364     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'J' at position [36]"));
1365     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'L' at position [38]"));
1366     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'O' at position [41]"));
1367     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'P' at position [42]"));
1368     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Q' at position [43]"));
1369     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'U' at position [47]"));
1370     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'X' at position [50]"));
1371     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid nucleotide residue 'Z' at position [52]"));
1372     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [53]"));
1373     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [54]"));
1374     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [55]"));
1375     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [56]"));
1376     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [57]"));
1377     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [252] at position [58]"));
1378     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [59]"));
1379     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [60]"));
1380     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [253] at position [61]"));
1381     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [254] at position [62]"));
1382     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "More than 10 invalid residues. Checking stopped"));
1383     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
1384     //AddChromosomeNoLocation(expected_errors, "lcl|good");
1385 
1386     eval = validator.Validate(seh, options);
1387     CheckErrors (*eval, expected_errors);
1388 
1389     // now repeat test, but with mRNA - this time Us should not be reported
1390     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
1391     delete expected_errors[8];
1392     expected_errors[8] = NULL;
1393     delete expected_errors[19];
1394     expected_errors[19] = NULL;
1395     eval = validator.Validate(seh, options);
1396     CheckErrors (*eval, expected_errors);
1397 
1398     // now repeat test, but with protein
1399     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_aa);
1400     NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
1401         if ((*it)->IsMolinfo()) {
1402             (*it)->SetMolinfo().SetBiomol(CMolInfo::eBiomol_peptide);
1403         }
1404     }
1405     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1406     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1407     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1408     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFB');
1409     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1410     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1411     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFC');
1412     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1413     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1414     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFD');
1415     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1416     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFE');
1417     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1418     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set().push_back('\xFF');
1419     entry->SetSeq().SetInst().SetLength(65);
1420     CRef<CSeq_feat> feat (new CSeq_feat());
1421     feat->SetData().SetProt().SetName().push_back("fake protein name");
1422     feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1423     feat->SetLocation().SetInt().SetFrom(0);
1424     feat->SetLocation().SetInt().SetTo(64);
1425     unit_test_util::AddFeat(feat, entry);
1426     scope.RemoveEntry (*entry);
1427     seh = scope.AddTopLevelSeqEntry(*entry);
1428 
1429     for (int j = 0; j < 22; j++) {
1430         if (expected_errors[j] != NULL) {
1431             delete expected_errors[j];
1432             expected_errors[j] = NULL;
1433         }
1434     }
1435     eval = validator.Validate(seh, options);
1436     CheckErrors (*eval, expected_errors);
1437 
1438     CLEAR_ERRORS
1439 
1440     // now look for lowercase characters
1441     scope.RemoveEntry (*entry);
1442     entry = unit_test_util::BuildGoodSeq();
1443     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("abcdefghijklmnopqrstuvwxyz");
1444     entry->SetSeq().SetInst().SetLength(26);
1445     seh = scope.AddTopLevelSeqEntry(*entry);
1446     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Sequence contains lower-case characters"));
1447     //AddChromosomeNoLocation(expected_errors, "lcl|good");
1448     eval = validator.Validate(seh, options);
1449     CheckErrors (*eval, expected_errors);
1450 
1451     scope.RemoveEntry (*entry);
1452     entry = unit_test_util::BuildGoodProtSeq();
1453     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("protein");
1454     seh = scope.AddTopLevelSeqEntry(*entry);
1455     eval = validator.Validate(seh, options);
1456     CheckErrors (*eval, expected_errors);
1457 
1458 
1459     CLEAR_ERRORS
1460 
1461     // now try delta sequence
1462     scope.RemoveEntry (*entry);
1463     entry = unit_test_util::BuildGoodSeq();
1464     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1465     entry->SetSeq().SetInst().ResetSeq_data();
1466     CRef<CDelta_seq> seg(new CDelta_seq());
1467     seg->SetLiteral().SetSeq_data().SetIupacna().Set("ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ");
1468     seg->SetLiteral().SetLength(52);
1469     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg);
1470     entry->SetSeq().SetInst().SetLength(52);
1471     seh = scope.AddTopLevelSeqEntry(*entry);
1472 
1473     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [5]"));
1474     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [6]"));
1475     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [9]"));
1476     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [10]"));
1477     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [12]"));
1478     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [15]"));
1479     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [16]"));
1480     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [17]"));
1481     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [21]"));
1482     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [24]"));
1483     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [26]"));
1484     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [E] at position [31]"));
1485     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [F] at position [32]"));
1486     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [I] at position [35]"));
1487     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [J] at position [36]"));
1488     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [L] at position [38]"));
1489     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [O] at position [41]"));
1490     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [P] at position [42]"));
1491     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Q] at position [43]"));
1492     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [U] at position [47]"));
1493     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [X] at position [50]"));
1494     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [Z] at position [52]"));
1495     //AddChromosomeNoLocation(expected_errors, "lcl|good");
1496     eval = validator.Validate(seh, options);
1497     CheckErrors (*eval, expected_errors);
1498 
1499     CLEAR_ERRORS
1500 
1501     // try protein delta sequence
1502     scope.RemoveEntry (*entry);
1503     entry = unit_test_util::BuildGoodProtSeq();
1504     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
1505     entry->SetSeq().SetInst().ResetSeq_data();
1506     CRef<CDelta_seq> seg2(new CDelta_seq());
1507     seg2->SetLiteral().SetSeq_data().SetIupacaa().Set("1234567");
1508     seg2->SetLiteral().SetLength(7);
1509     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(seg2);
1510     entry->SetSeq().SetInst().SetLength(7);
1511     seh = scope.AddTopLevelSeqEntry(*entry);
1512 
1513     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [1] at position [1]"));
1514     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [2] at position [2]"));
1515     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [3] at position [3]"));
1516     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [4] at position [4]"));
1517     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [5] at position [5]"));
1518     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [6] at position [6]"));
1519     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "InvalidResidue", "Invalid residue [7] at position [7]"));
1520     //AddChromosomeNoLocation(expected_errors, "lcl|good");
1521 
1522     eval = validator.Validate(seh, options);
1523     CheckErrors (*eval, expected_errors);
1524 
1525     CLEAR_ERRORS
1526 }
1527 
1528 
WriteOutTemp(CRef<CSeq_entry> entry)1529 static void WriteOutTemp (CRef<CSeq_entry> entry)
1530 {
1531     // construct a temp file name
1532     CNcbiOstrstream oss;
1533     oss << "test.asn";
1534     string filename = CNcbiOstrstreamToString(oss);
1535     string fullPath = CDirEntry::MakePath(".", filename);
1536 
1537     // initialize a binary output stream
1538     auto_ptr<CNcbiOstream> outStream;
1539     outStream.reset(new CNcbiOfstream(
1540         fullPath.c_str(),
1541         IOS_BASE::out));
1542     if (!(*outStream)) {
1543         return;
1544     }
1545 
1546     auto_ptr<CObjectOStream> outObject;
1547     // Associate ASN.1 text serialization methods with the input
1548     outObject.reset(new CObjectOStreamAsn(*outStream));
1549 
1550     // write the asn data
1551     try {
1552         *outObject << *entry;
1553         outStream->flush();
1554     } catch (exception& ) {
1555     }
1556 }
1557 
1558 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_StopInProtein)1559 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_StopInProtein)
1560 {
1561     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
1562 
1563     STANDARD_SETUP
1564 
1565     entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MP*K*E*N");
1566     entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
1567     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
1568     cds->SetExcept(true);
1569     cds->SetExcept_text("unclassified translation discrepancy");
1570 
1571     BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1572     BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1573 
1574     // list of expected errors
1575     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1576     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "ExceptionProblem", "unclassified translation discrepancy is not a legal exception explanation"));
1577     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1578     //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1579 
1580     eval = validator.Validate(seh, options);
1581     CheckErrors (*eval, expected_errors);
1582     WriteOutTemp(entry);
1583 
1584     CLEAR_ERRORS
1585     cds->ResetExcept();
1586     cds->ResetExcept_text();
1587     BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
1588     BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
1589     BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
1590 
1591     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein", "[3] termination symbols in protein sequence (gene? - fake protein name)"));
1592     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon", "Illegal start codon (and 3 internal stops). Probably wrong genetic code [0]"));
1593     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop", "3 internal stops (and illegal start codon). Genetic code [0]"));
1594     //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
1595 
1596     eval = validator.Validate(seh, options);
1597     CheckErrors (*eval, expected_errors);
1598     WriteOutTemp(entry);
1599 
1600     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
1601     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
1602 
1603     // write out seq-entry
1604     WriteOutTemp(entry);
1605 
1606     delete expected_errors[1];
1607     expected_errors[1] = NULL;
1608     expected_errors[2]->SetErrMsg("3 internal stops. Genetic code [0]");
1609     eval = validator.Validate(seh, options);
1610     CheckErrors (*eval, expected_errors);
1611 
1612     CLEAR_ERRORS
1613 }
1614 
1615 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_PartialInconsistent)1616 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_PartialInconsistent)
1617 {
1618 #if 0
1619     //We don't care about segmented sets any more
1620     CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();
1621 
1622     STANDARD_SETUP
1623 
1624     entry->SetSeq().SetInst().ResetSeq_data();
1625     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
1626     CRef<CSeq_id> id(new CSeq_id("gb|AY123456"));
1627     CRef<CSeq_loc> loc1(new CSeq_loc(*id, 0, 3));
1628     entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc1);
1629     CRef<CSeq_id> id2(new CSeq_id("gb|AY123457"));
1630     CRef<CSeq_loc> loc2(new CSeq_loc(*id2, 0, 2));
1631     entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc2);
1632 
1633     // list of expected errors
1634     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PartialInconsistent", "Partial segmented sequence without MolInfo partial"));
1635 
1636     // not-set
1637     loc1->SetPartialStart(true, eExtreme_Biological);
1638     loc2->SetPartialStop(true, eExtreme_Biological);
1639     eval = validator.Validate(seh, options);
1640     CheckErrors (*eval, expected_errors);
1641     loc1->SetPartialStart(true, eExtreme_Biological);
1642     loc2->SetPartialStop(false, eExtreme_Biological);
1643     eval = validator.Validate(seh, options);
1644     CheckErrors (*eval, expected_errors);
1645     loc1->SetPartialStart(false, eExtreme_Biological);
1646     loc2->SetPartialStop(true, eExtreme_Biological);
1647     eval = validator.Validate(seh, options);
1648     CheckErrors (*eval, expected_errors);
1649 
1650     // unknown
1651     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_unknown);
1652 
1653     loc1->SetPartialStart(true, eExtreme_Biological);
1654     loc2->SetPartialStop(true, eExtreme_Biological);
1655     eval = validator.Validate(seh, options);
1656     CheckErrors (*eval, expected_errors);
1657     loc1->SetPartialStart(true, eExtreme_Biological);
1658     loc2->SetPartialStop(false, eExtreme_Biological);
1659     eval = validator.Validate(seh, options);
1660     CheckErrors (*eval, expected_errors);
1661     loc1->SetPartialStart(false, eExtreme_Biological);
1662     loc2->SetPartialStop(true, eExtreme_Biological);
1663     eval = validator.Validate(seh, options);
1664     CheckErrors (*eval, expected_errors);
1665 
1666     // complete
1667     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_complete);
1668 
1669     loc1->SetPartialStart(true, eExtreme_Biological);
1670     loc2->SetPartialStop(true, eExtreme_Biological);
1671     eval = validator.Validate(seh, options);
1672     CheckErrors (*eval, expected_errors);
1673     loc1->SetPartialStart(true, eExtreme_Biological);
1674     loc2->SetPartialStop(false, eExtreme_Biological);
1675     eval = validator.Validate(seh, options);
1676     CheckErrors (*eval, expected_errors);
1677     loc1->SetPartialStart(false, eExtreme_Biological);
1678     loc2->SetPartialStop(true, eExtreme_Biological);
1679     eval = validator.Validate(seh, options);
1680     CheckErrors (*eval, expected_errors);
1681 
1682     // partial
1683     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_partial);
1684 
1685     loc1->SetPartialStart(false, eExtreme_Biological);
1686     loc2->SetPartialStop(false, eExtreme_Biological);
1687     expected_errors[0]->SetErrMsg("Complete segmented sequence with MolInfo partial");
1688     eval = validator.Validate(seh, options);
1689     CheckErrors (*eval, expected_errors);
1690 
1691     // no-left
1692     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_no_left);
1693 
1694     loc1->SetPartialStart(true, eExtreme_Biological);
1695     loc2->SetPartialStop(true, eExtreme_Biological);
1696     expected_errors[0]->SetErrMsg("No-left inconsistent with segmented SeqLoc");
1697     eval = validator.Validate(seh, options);
1698     CheckErrors (*eval, expected_errors);
1699     loc1->SetPartialStart(false, eExtreme_Biological);
1700     loc2->SetPartialStop(true, eExtreme_Biological);
1701     eval = validator.Validate(seh, options);
1702     CheckErrors (*eval, expected_errors);
1703     loc1->SetPartialStart(false, eExtreme_Biological);
1704     loc2->SetPartialStop(false, eExtreme_Biological);
1705     eval = validator.Validate(seh, options);
1706     CheckErrors (*eval, expected_errors);
1707 
1708     // no-right
1709     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_no_right);
1710 
1711     loc1->SetPartialStart(true, eExtreme_Biological);
1712     loc2->SetPartialStop(true, eExtreme_Biological);
1713     expected_errors[0]->SetErrMsg("No-right inconsistent with segmented SeqLoc");
1714     eval = validator.Validate(seh, options);
1715     CheckErrors (*eval, expected_errors);
1716     loc1->SetPartialStart(true, eExtreme_Biological);
1717     loc2->SetPartialStop(false, eExtreme_Biological);
1718     eval = validator.Validate(seh, options);
1719     CheckErrors (*eval, expected_errors);
1720     loc1->SetPartialStart(false, eExtreme_Biological);
1721     loc2->SetPartialStop(false, eExtreme_Biological);
1722     eval = validator.Validate(seh, options);
1723     CheckErrors (*eval, expected_errors);
1724 
1725     // no-ends
1726     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_no_ends);
1727 
1728     expected_errors[0]->SetErrMsg("No-ends inconsistent with segmented SeqLoc");
1729     loc1->SetPartialStart(true, eExtreme_Biological);
1730     loc2->SetPartialStop(false, eExtreme_Biological);
1731     eval = validator.Validate(seh, options);
1732     CheckErrors (*eval, expected_errors);
1733     loc1->SetPartialStart(false, eExtreme_Biological);
1734     loc2->SetPartialStop(true, eExtreme_Biological);
1735     eval = validator.Validate(seh, options);
1736     CheckErrors (*eval, expected_errors);
1737     loc1->SetPartialStart(false, eExtreme_Biological);
1738     loc2->SetPartialStop(false, eExtreme_Biological);
1739     eval = validator.Validate(seh, options);
1740     CheckErrors (*eval, expected_errors);
1741 
1742     CLEAR_ERRORS
1743 #endif
1744 }
1745 
1746 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ShortSeq)1747 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ShortSeq)
1748 {
1749     CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();
1750 
1751     STANDARD_SETUP
1752 
1753     entry->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPR");
1754     entry->SetSeq().SetInst().SetLength(3);
1755     entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetTo(2);
1756 
1757     // don't report if pdb
1758     CRef<CPDB_seq_id> pdb_id(new CPDB_seq_id());
1759     pdb_id->SetMol().Set("foo");
1760     entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1761     entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetPdb(*pdb_id);
1762     scope.RemoveTopLevelSeqEntry(seh);
1763     seh = scope.AddTopLevelSeqEntry(*entry);
1764     eval = validator.Validate(seh, options);
1765     //AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1766     CheckErrors (*eval, expected_errors);
1767 
1768     // new test if no coding region
1769     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialsInconsistent", "Molinfo completeness and protein feature partials conflict"));
1770     expected_errors[0]->SetAccession("lcl|good");
1771     entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
1772     entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
1773     scope.RemoveTopLevelSeqEntry(seh);
1774     seh = scope.AddTopLevelSeqEntry(*entry);
1775     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_partial);
1776 
1777     eval = validator.Validate(seh, options);
1778     CheckErrors (*eval, expected_errors);
1779     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_no_left);
1780     eval = validator.Validate(seh, options);
1781     CheckErrors (*eval, expected_errors);
1782     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_no_right);
1783     eval = validator.Validate(seh, options);
1784     CheckErrors (*eval, expected_errors);
1785     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_no_ends);
1786     eval = validator.Validate(seh, options);
1787     CheckErrors (*eval, expected_errors);
1788 
1789     CLEAR_ERRORS
1790 
1791     // for all other completeness, report
1792     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortSeq", "Sequence only 3 residues"));
1793     //AddChromosomeNoLocation(expected_errors, "lcl|good");
1794     NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
1795         if ((*it)->IsMolinfo()) {
1796             (*it)->SetMolinfo().ResetCompleteness();
1797         }
1798     }
1799     eval = validator.Validate(seh, options);
1800     CheckErrors (*eval, expected_errors);
1801     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_unknown);
1802     eval = validator.Validate(seh, options);
1803     CheckErrors (*eval, expected_errors);
1804     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_complete);
1805     eval = validator.Validate(seh, options);
1806     CheckErrors (*eval, expected_errors);
1807     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_other);
1808     eval = validator.Validate(seh, options);
1809     CheckErrors (*eval, expected_errors);
1810 
1811     // nucleotide
1812     scope.RemoveTopLevelSeqEntry(seh);
1813     entry = unit_test_util::BuildGoodSeq();
1814     seh = scope.AddTopLevelSeqEntry(*entry);
1815     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCTTT");
1816     entry->SetSeq().SetInst().SetLength(9);
1817     expected_errors[0]->SetErrMsg("Sequence only 9 residues");
1818     eval = validator.Validate(seh, options);
1819     CheckErrors (*eval, expected_errors);
1820 
1821     CLEAR_ERRORS
1822 
1823     // don't report if pdb
1824     entry->SetSeq().SetId().front()->SetPdb(*pdb_id);
1825     scope.RemoveTopLevelSeqEntry(seh);
1826     seh = scope.AddTopLevelSeqEntry(*entry);
1827     eval = validator.Validate(seh, options);
1828     //AddChromosomeNoLocation(expected_errors, "pdb|foo| ");
1829     CheckErrors (*eval, expected_errors);
1830 
1831     CLEAR_ERRORS
1832 }
1833 
1834 
IsProteinTech(CMolInfo::TTech tech)1835 static bool IsProteinTech (CMolInfo::TTech tech)
1836 {
1837     bool rval = false;
1838 
1839     switch (tech) {
1840          case CMolInfo::eTech_concept_trans:
1841          case CMolInfo::eTech_seq_pept:
1842          case CMolInfo::eTech_both:
1843          case CMolInfo::eTech_seq_pept_overlap:
1844          case CMolInfo::eTech_seq_pept_homol:
1845          case CMolInfo::eTech_concept_trans_a:
1846              rval = true;
1847              break;
1848          default:
1849              break;
1850     }
1851     return rval;
1852 }
1853 
1854 
AddRefGeneTrackingUserObject(CRef<CSeq_entry> entry)1855 static void AddRefGeneTrackingUserObject(CRef<CSeq_entry> entry)
1856 {
1857     CRef<CSeqdesc> desc(new CSeqdesc());
1858     desc->SetUser().SetObjectType(CUser_object::eObjectType_RefGeneTracking);
1859     desc->SetUser().SetRefGeneTrackingStatus(CUser_object::eRefGeneTrackingStatus_INFERRED);
1860     if (entry->IsSeq()) {
1861         entry->SetSeq().SetDescr().Set().push_back(desc);
1862     } else if (entry->IsSet()) {
1863         entry->SetSet().SetDescr().Set().push_back(desc);
1864     }
1865 }
1866 
1867 
SetRefGeneTrackingStatus(CRef<CSeq_entry> entry,string status)1868 static void SetRefGeneTrackingStatus(CRef<CSeq_entry> entry, string status)
1869 {
1870     if (entry->IsSeq()) {
1871         NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
1872             if ((*it)->IsUser() && (*it)->GetUser().IsRefGeneTracking()) {
1873                 (*it)->SetUser().SetData().front()->SetData().SetStr(status);
1874             }
1875         }
1876     } else if (entry->IsSet()) {
1877         NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSet().SetDescr().Set()) {
1878             if ((*it)->IsUser() && (*it)->GetUser().IsRefGeneTracking()) {
1879                 (*it)->SetUser().SetData().front()->SetData().SetStr(status);
1880             }
1881         }
1882     }
1883 }
1884 
1885 
SetTitle(CRef<CSeq_entry> entry,string title)1886 static void SetTitle(CRef<CSeq_entry> entry, string title)
1887 {
1888     bool found = false;
1889 
1890     EDIT_EACH_DESCRIPTOR_ON_SEQENTRY (it, *entry) {
1891         if ((*it)->IsTitle()) {
1892             if (NStr::IsBlank((*it)->GetTitle())) {
1893                 ERASE_DESCRIPTOR_ON_SEQENTRY (it, *entry);
1894             } else {
1895                 (*it)->SetTitle(title);
1896             }
1897             found = true;
1898         }
1899     }
1900     if (!found && !NStr::IsBlank(title)) {
1901         CRef<CSeqdesc> desc(new CSeqdesc());
1902         desc->SetTitle(title);
1903         entry->SetSeq().SetDescr().Set().push_back(desc);
1904     }
1905 }
1906 
1907 
AddGenbankKeyword(CRef<CSeq_entry> entry,string keyword)1908 static void AddGenbankKeyword (CRef<CSeq_entry> entry, string keyword)
1909 {
1910     bool found = false;
1911 
1912     NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
1913         if ((*it)->IsGenbank()) {
1914             (*it)->SetGenbank().SetKeywords().push_back(keyword);
1915             found = true;
1916         }
1917     }
1918     if (!found) {
1919         CRef<CSeqdesc> desc(new CSeqdesc());
1920         desc->SetGenbank().SetKeywords().push_back(keyword);
1921         entry->SetSeq().SetDescr().Set().push_back(desc);
1922     }
1923 }
1924 
1925 
TestDeltaTechAllowed(CMolInfo::TTech tech)1926 void TestDeltaTechAllowed(CMolInfo::TTech tech)
1927 {
1928     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
1929     STANDARD_SETUP
1930 
1931     SetTech(entry, tech);
1932     eval = validator.Validate(seh, options);
1933     if (tech == CMolInfo::eTech_barcode) {
1934         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
1935     } else if (tech == CMolInfo::eTech_tsa) {
1936         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TSAseqGapProblem", "TSA Seq_gap NULL"));
1937         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
1938         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"genomic\" is not appropriate for sequences that use the TSA technique."));
1939         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAseqGapProblem", "TSA submission includes wrong gap type. Gaps for TSA should be Assembly Gaps with linkage evidence."));
1940     } else if (tech == CMolInfo::eTech_wgs) {
1941         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
1942     }
1943     if (tech == CMolInfo::eTech_wgs) {
1944         AddChromosomeNoLocation(expected_errors, "lcl|good");
1945     }
1946 
1947     CheckErrors(*eval, expected_errors);
1948 
1949     CLEAR_ERRORS
1950 }
1951 
1952 
TestDeltaTechNotAllowed(CMolInfo::TTech tech)1953 void TestDeltaTechNotAllowed(CMolInfo::TTech tech)
1954 {
1955     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
1956     STANDARD_SETUP
1957 
1958     SetTech(entry, tech);
1959     eval = validator.Validate(seh, options);
1960     if (IsProteinTech(tech)) {
1961         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
1962     } else if (tech == CMolInfo::eTech_est) {
1963         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
1964     }
1965     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "Delta seq technique should not be [" + NStr::UIntToString(tech) + "]"));
1966     //AddChromosomeNoLocation(expected_errors, "lcl|good");
1967     eval = validator.Validate(seh, options);
1968     CheckErrors(*eval, expected_errors);
1969     CLEAR_ERRORS
1970 }
1971 
1972 
TestStartGapSeg(CMolInfo::TTech tech)1973 void TestStartGapSeg(CMolInfo::TTech tech)
1974 {
1975     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
1976 
1977     STANDARD_SETUP
1978 
1979     CRef<CDelta_seq> start_gap_seg(new CDelta_seq());
1980     start_gap_seg->SetLiteral().SetLength(10);
1981     start_gap_seg->SetLiteral().SetSeq_data().SetGap();
1982     entry->SetSeq().SetInst().SetExt().SetDelta().Set().insert(entry->SetSeq().SetInst().SetExt().SetDelta().Set().begin(), start_gap_seg);
1983     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
1984     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
1985     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("AAATTTGGGC", CSeq_inst::eMol_dna);
1986     CRef<CDelta_seq> end_gap_seg(new CDelta_seq());
1987     end_gap_seg->SetLiteral().SetLength(10);
1988     end_gap_seg->SetLiteral().SetSeq_data().SetGap();
1989     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(end_gap_seg);
1990     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral(10);
1991     entry->SetSeq().SetInst().SetLength(94);
1992     SetTech(entry, tech);
1993     if (tech == CMolInfo::eTech_wgs) {
1994         AddChromosomeNoLocation(expected_errors, "lcl|good");
1995     }
1996     // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "First delta seq component is a gap"));
1997     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDeltaSeq", "There is 1 adjacent gap in delta seq"));
1998     // expected_errors.push_back(new CExpectedError("lcl|good", tech == CMolInfo::eTech_wgs ? eDiag_Warning : eDiag_Error, "BadDeltaSeq", "Last delta seq component is a gap"));
1999     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
2000     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
2001     /*
2002     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
2003         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
2004     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
2005         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
2006     */
2007     if (tech == CMolInfo::eTech_wgs) {
2008         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WGSseqGapProblem", "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
2009     }
2010     eval = validator.Validate(seh, options);
2011     CheckErrors(*eval, expected_errors);
2012 
2013     CLEAR_ERRORS
2014 }
2015 
2016 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadDeltaSeq)2017 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadDeltaSeq)
2018 {
2019     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
2020 
2021     STANDARD_SETUP
2022 
2023     NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
2024         if ((*it)->IsMolinfo()) {
2025             (*it)->SetMolinfo().SetTech(CMolInfo::eTech_derived);
2026         }
2027     }
2028 
2029     // don't report if NT or NC
2030     scope.RemoveTopLevelSeqEntry(seh);
2031     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
2032     seh = scope.AddTopLevelSeqEntry(*entry);
2033     eval = validator.Validate(seh, options);
2034     //AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
2035     CheckErrors (*eval, expected_errors);
2036     CLEAR_ERRORS
2037 
2038     entry->SetSeq().SetId().front()->SetOther().SetAccession("NT_123456");
2039     scope.RemoveTopLevelSeqEntry(seh);
2040     seh = scope.AddTopLevelSeqEntry(*entry);
2041     eval = validator.Validate(seh, options);
2042     //AddChromosomeNoLocation(expected_errors, "ref|NT_123456|");
2043     CheckErrors (*eval, expected_errors);
2044     CLEAR_ERRORS
2045 
2046     // don't report if gen-prod-set
2047 
2048     entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
2049     scope.RemoveTopLevelSeqEntry(seh);
2050     seh = scope.AddTopLevelSeqEntry(*entry);
2051 
2052     // allowed tech values
2053     vector<CMolInfo::TTech> allowed_list;
2054     allowed_list.push_back(CMolInfo::eTech_htgs_0);
2055     allowed_list.push_back(CMolInfo::eTech_htgs_1);
2056     allowed_list.push_back(CMolInfo::eTech_htgs_2);
2057     allowed_list.push_back(CMolInfo::eTech_htgs_3);
2058     allowed_list.push_back(CMolInfo::eTech_wgs);
2059     allowed_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2060     allowed_list.push_back(CMolInfo::eTech_unknown);
2061     allowed_list.push_back(CMolInfo::eTech_standard);
2062     allowed_list.push_back(CMolInfo::eTech_htc);
2063     allowed_list.push_back(CMolInfo::eTech_barcode);
2064     allowed_list.push_back(CMolInfo::eTech_tsa);
2065 
2066     for (CMolInfo::TTech i = CMolInfo::eTech_unknown;
2067          i <= CMolInfo::eTech_tsa;
2068          i++) {
2069          bool allowed = false;
2070          for (vector<CMolInfo::TTech>::iterator it = allowed_list.begin();
2071               it != allowed_list.end() && !allowed;
2072               ++it) {
2073               if (*it == i) {
2074                   allowed = true;
2075               }
2076          }
2077          if (allowed) {
2078              // don't report for htgs_0
2079              TestDeltaTechAllowed(i);
2080          } else {
2081              TestDeltaTechNotAllowed(i);
2082          }
2083     }
2084 
2085 
2086     CLEAR_ERRORS
2087 
2088     TestStartGapSeg(CMolInfo::eTech_wgs);
2089     TestStartGapSeg(CMolInfo::eTech_htgs_0);
2090 
2091     CLEAR_ERRORS
2092 }
2093 
2094 
AdjustGap(CSeq_gap & gap,CSeq_gap::EType gap_type,bool is_linked,vector<CLinkage_evidence::EType> linkage_evidence)2095 void AdjustGap(CSeq_gap& gap, CSeq_gap::EType gap_type, bool is_linked, vector<CLinkage_evidence::EType> linkage_evidence)
2096 {
2097     gap.Reset();
2098     gap.SetType(gap_type);
2099     if (is_linked) {
2100         gap.SetLinkage(CSeq_gap::eLinkage_linked);
2101     } else {
2102         gap.ResetLinkage();
2103     }
2104     gap.ResetLinkage_evidence();
2105     for (auto it : linkage_evidence) {
2106         CRef<CLinkage_evidence> ev(new CLinkage_evidence());
2107         ev->SetType(it);
2108         gap.SetLinkage_evidence().push_back(ev);
2109     }
2110 }
2111 
2112 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqGapBadLinkage)2113 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqGapBadLinkage)
2114 {
2115     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
2116 
2117     vector<CLinkage_evidence::EType> evidence;
2118     evidence.push_back(CLinkage_evidence::eType_align_genus);
2119     for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2120         if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2121             it->GetLiteral().GetSeq_data().IsGap()) {
2122             AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2123                 CSeq_gap::eType_short_arm, true, evidence);
2124         }
2125     }
2126 
2127     STANDARD_SETUP
2128 
2129     //AddChromosomeNoLocation(expected_errors, "lcl|good");
2130     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2131             "SeqGapBadLinkage", "Seq-gap of type 3 should not have linkage evidence"));
2132 
2133     eval = validator.Validate(seh, options);
2134     CheckErrors (*eval, expected_errors);
2135 
2136     CLEAR_ERRORS
2137 
2138     scope.RemoveTopLevelSeqEntry(seh);
2139     for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2140         if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2141             it->GetLiteral().GetSeq_data().IsGap()) {
2142             CSeq_gap& gap = it->SetLiteral().SetSeq_data().SetGap();
2143             gap.ResetLinkage();
2144             gap.ResetType();
2145         }
2146     }
2147     seh = scope.AddTopLevelSeqEntry(*entry);
2148 
2149     //AddChromosomeNoLocation(expected_errors, "lcl|good");
2150     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
2151             "SeqGapBadLinkage", "Seq-gap with linkage evidence must have linkage field set to linked"));
2152 
2153     eval = validator.Validate(seh, options);
2154     CheckErrors (*eval, expected_errors);
2155 
2156     CLEAR_ERRORS
2157 
2158     scope.RemoveTopLevelSeqEntry(seh);
2159     evidence.push_back(CLinkage_evidence::eType_align_genus);
2160     for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2161         if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2162             it->GetLiteral().GetSeq_data().IsGap()) {
2163             AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2164                 CSeq_gap::eType_fragment, true, evidence);
2165         }
2166     }
2167     seh = scope.AddTopLevelSeqEntry(*entry);
2168 
2169     //AddChromosomeNoLocation(expected_errors, "lcl|good");
2170     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2171             "SeqGapBadLinkage", "Linkage evidence 'align genus' appears 2 times"));
2172 
2173     eval = validator.Validate(seh, options);
2174     CheckErrors (*eval, expected_errors);
2175 
2176     CLEAR_ERRORS
2177 
2178     evidence.pop_back();
2179     evidence.push_back(CLinkage_evidence::eType_unspecified);
2180     scope.RemoveTopLevelSeqEntry(seh);
2181     for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2182         if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2183             it->GetLiteral().GetSeq_data().IsGap()) {
2184             AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2185                 CSeq_gap::eType_fragment, true, evidence);
2186         }
2187     }
2188     seh = scope.AddTopLevelSeqEntry(*entry);
2189 
2190     //AddChromosomeNoLocation(expected_errors, "lcl|good");
2191     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
2192             "SeqGapBadLinkage", "Seq-gap type has unspecified and additional linkage evidence"));
2193 
2194     eval = validator.Validate(seh, options);
2195     CheckErrors (*eval, expected_errors);
2196 
2197     CLEAR_ERRORS
2198 
2199     scope.RemoveTopLevelSeqEntry(seh);
2200     evidence.clear();
2201     evidence.push_back(CLinkage_evidence::eType_unspecified);
2202     for (auto it : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
2203         if (it->IsLiteral() && it->GetLiteral().IsSetSeq_data() &&
2204             it->GetLiteral().GetSeq_data().IsGap()) {
2205             AdjustGap(it->SetLiteral().SetSeq_data().SetGap(),
2206                 CSeq_gap::eType_unknown, true, evidence);
2207         }
2208     }
2209     seh = scope.AddTopLevelSeqEntry(*entry);
2210 
2211     //AddChromosomeNoLocation(expected_errors, "lcl|good");
2212     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2213             "SeqGapBadLinkage", "Single Seq-gap has unknown type and unspecified linkage"));
2214 
2215     eval = validator.Validate(seh, options);
2216     CheckErrors (*eval, expected_errors);
2217 
2218     CLEAR_ERRORS
2219 
2220     scope.RemoveTopLevelSeqEntry(seh);
2221     CRef<objects::CDelta_seq> gap_seg(new objects::CDelta_seq());
2222     gap_seg->SetLiteral().SetLength(10);
2223     AdjustGap(gap_seg->SetLiteral().SetSeq_data().SetGap(),
2224                 CSeq_gap::eType_unknown, true, evidence);
2225 
2226     // adjust delta to avoid errors about large number of Ns in first and last 50 bp
2227     entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT");
2228     entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(50);
2229     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
2230     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATGTACCGTACGTTTTCCCATGATGATGTACCGTACGTTTT", objects::CSeq_inst::eMol_dna);
2231     entry->SetSeq().SetInst().SetLength(132);
2232 
2233     seh = scope.AddTopLevelSeqEntry(*entry);
2234 
2235     //AddChromosomeNoLocation(expected_errors, "lcl|good");
2236     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
2237             "SeqGapBadLinkage", "All 2 Seq-gaps have unknown type and unspecified linkage"));
2238 
2239     eval = validator.Validate(seh, options);
2240     CheckErrors (*eval, expected_errors);
2241 
2242     CLEAR_ERRORS
2243 }
2244 
2245 
ChangeErrorAcc(vector<CExpectedError * > expected_errors,const string & acc)2246 void ChangeErrorAcc(vector<CExpectedError *> expected_errors, const string& acc)
2247 {
2248     for (auto it : expected_errors) {
2249         if (it) {
2250             it->SetAccession(acc);
2251         }
2252     }
2253 }
2254 
2255 
// Exercises the ConflictingIdsOnBioseq diagnostic by placing two Seq-ids of
// the same (or a related) type on a single Bioseq, one id type at a time.
// The same two CSeq_id objects (id1 = first id of the entry, id2 = the id
// appended below) are re-typed for each step, and the entry is removed from
// and re-added to the scope around every change.  Several steps reuse the
// existing expected error in slot 0 and only rewrite its accession/message,
// so the order of the steps matters.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingIdsOnBioseq)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (lcl|good - lcl|bad)"));
    //AddChromosomeNoLocation(expected_errors, "lcl|good");

    // local IDs: add a second local id next to the original "good" one
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> id2(new CSeq_id());
    id2->SetLocal().SetStr("bad");
    entry->SetSeq().SetId().push_back(id2);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // GIBBSQ: re-type both ids; the expected error in slot 0 is updated in place
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> id1 = entry->SetSeq().SetId().front();
    id1->SetGibbsq(1);
    id2->SetGibbsq(2);
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "bbs|1");
    expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbs|1 - bbs|2)");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // GIBBMT
    scope.RemoveTopLevelSeqEntry(seh);
    id1->SetGibbmt(1);
    id2->SetGibbmt(2);
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "bbm|1");
    expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (bbm|1 - bbm|2)");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // GI: a third (GenBank) id is added for this step only; the expected
    // error is reported against that accession, and the id is removed again
    // once the check is done
    scope.RemoveTopLevelSeqEntry(seh);
    id1->SetGi(GI_CONST(1));
    id2->SetGi(GI_CONST(2));
    CRef<CSeq_id> id3(new CSeq_id("gb|AY123456.1"));
    entry->SetSeq().SetId().push_back (id3);
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "gb|AY123456.1|");
    expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gi|1 - gi|2)");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    entry->SetSeq().SetId().pop_back();

    // GIIM: the expected list is rebuilt from scratch because two
    // IdOnMultipleBioseqs errors are expected around the conflict error
    scope.RemoveTopLevelSeqEntry(seh);
    id1->SetGiim().SetId(1);
    id1->SetGiim().SetDb("foo");
    id2->SetGiim().SetId(2);
    id2->SetGiim().SetDb("foo");
    seh = scope.AddTopLevelSeqEntry(*entry);
    CLEAR_ERRORS

    expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|1) unable to find itself - possible internal error"));
    expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gim|1 - gim|2)"));
    expected_errors.push_back(new CExpectedError("gim|1", eDiag_Error, "IdOnMultipleBioseqs", "BioseqFind (gim|2) unable to find itself - possible internal error"));
    //AddChromosomeNoLocation(expected_errors, "gim|1");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    CLEAR_ERRORS

    // patent: starts a fresh single-entry expected list that the pdb and
    // general steps below keep rewriting via slot 0
    scope.RemoveTopLevelSeqEntry(seh);
    id1->SetPatent().SetSeqid(1);
    id1->SetPatent().SetCit().SetCountry("USA");
    id1->SetPatent().SetCit().SetId().SetNumber("1");
    id2->SetPatent().SetSeqid(2);
    id2->SetPatent().SetCit().SetCountry("USA");
    id2->SetPatent().SetCit().SetId().SetNumber("2");
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (pat|USA|1|1 - pat|USA|2|2)"));
    //AddChromosomeNoLocation(expected_errors, "pat|USA|1|1");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // pdb: note the trailing space in the expected id strings
    scope.RemoveTopLevelSeqEntry(seh);
    id1->SetPdb().SetMol().Set("good");
    id2->SetPdb().SetMol().Set("badd");
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "pdb|good| ");
    expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (pdb|good|  - pdb|badd| )");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // general: two ids in the same db with different tags
    scope.RemoveTopLevelSeqEntry(seh);
    id1->SetGeneral().SetDb("a");
    id1->SetGeneral().SetTag().SetStr("good");
    id2->SetGeneral().SetDb("a");
    id2->SetGeneral().SetTag().SetStr("bad");
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "gnl|a|good");
    expected_errors[0]->SetErrMsg("Conflicting ids on a Bioseq: (gnl|a|good - gnl|a|bad)");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
    // should get no error if db values are different
    scope.RemoveTopLevelSeqEntry(seh);
    id2->SetGeneral().SetDb("b");
    seh = scope.AddTopLevelSeqEntry(*entry);
    //AddChromosomeNoLocation(expected_errors, "gnl|a|good");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // genbank: two different accessions
    scope.RemoveTopLevelSeqEntry(seh);
    expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY222222|)"));
    id1->SetGenbank().SetAccession("AY123456");
    id2->SetGenbank().SetAccession("AY222222");
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // try genbank with accession same, versions different
    scope.RemoveTopLevelSeqEntry(seh);
    id2->SetGenbank().SetAccession("AY123456");
    id2->SetGenbank().SetVersion(2);
    seh = scope.AddTopLevelSeqEntry(*entry);
    CLEAR_ERRORS
    //AddChromosomeNoLocation(expected_errors, "gb|AY123456.2|");
    expected_errors.push_back(new CExpectedError("gb|AY123456.2|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gb|AY123456.2|)"));
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // try similar id type (GenBank vs. gpipe with the same accession)
    scope.RemoveTopLevelSeqEntry(seh);
    id2->SetGpipe().SetAccession("AY123456");
    seh = scope.AddTopLevelSeqEntry(*entry);
    CLEAR_ERRORS
    expected_errors.push_back(new CExpectedError("gb|AY123456|", eDiag_Error, "ConflictingIdsOnBioseq", "Conflicting ids on a Bioseq: (gb|AY123456| - gpp|AY123456|)"));
    //AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // LRG: a general id in db "LRG" without an NG_ accession; the previous
    // expected error is reused with a new accession, message and severity
    scope.RemoveTopLevelSeqEntry(seh);
    id1->SetGeneral().SetDb("LRG");
    id1->SetGeneral().SetTag().SetStr("good");
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "gpp|AY123456|");
    expected_errors[0]->SetErrMsg("LRG sequence needs NG_ accession");
    expected_errors[0]->SetSeverity(eDiag_Critical);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    // no error if has NG
    scope.RemoveTopLevelSeqEntry(seh);
    id2->SetOther().SetAccession("NG_123456");
    seh = scope.AddTopLevelSeqEntry(*entry);
    CLEAR_ERRORS
    //AddChromosomeNoLocation(expected_errors, "ref|NG_123456|");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
2421 
2422 
// A Bioseq whose Seq-inst.mol is the generic "na" value must be reported
// with a MolNuclAcid error.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_MolNuclAcid)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    // Replace the molecule type with the unspecific nucleic-acid value.
    entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_na);

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MolNuclAcid", "Bioseq.mol is type nucleic acid"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
2438 
2439 
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingBiomolTech)2440 BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ConflictingBiomolTech)
2441 {
2442     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
2443 
2444     STANDARD_SETUP
2445 
2446     // allowed tech values
2447     vector<CMolInfo::TTech> genomic_list;
2448     genomic_list.push_back(CMolInfo::eTech_sts);
2449     genomic_list.push_back(CMolInfo::eTech_survey);
2450     genomic_list.push_back(CMolInfo::eTech_wgs);
2451     genomic_list.push_back(CMolInfo::eTech_htgs_0);
2452     genomic_list.push_back(CMolInfo::eTech_htgs_1);
2453     genomic_list.push_back(CMolInfo::eTech_htgs_2);
2454     genomic_list.push_back(CMolInfo::eTech_htgs_3);
2455     genomic_list.push_back(CMolInfo::eTech_composite_wgs_htgs);
2456 
2457     for (CMolInfo::TTech i = CMolInfo::eTech_unknown;
2458          i <= CMolInfo::eTech_tsa;
2459         i++) {
2460         bool genomic = false;
2461         for (vector<CMolInfo::TTech>::iterator it = genomic_list.begin();
2462               it != genomic_list.end() && !genomic;
2463               ++it) {
2464             if (*it == i) {
2465                 genomic = true;
2466             }
2467         }
2468         entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2469         SetTech (entry, i);
2470         unit_test_util::SetBiomol (entry, CMolInfo::eBiomol_cRNA);
2471         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2472         if (i == CMolInfo::eTech_wgs) {
2473             AddChromosomeNoLocation(expected_errors, "lcl|good");
2474         }
2475         if (i == CMolInfo::eTech_est) {
2476             expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA", "EST sequence should be mRNA"));
2477         }
2478         if (i == CMolInfo::eTech_htgs_2) {
2479             expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
2480         }
2481         if (genomic) {
2482             expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic", "HTGS/STS/GSS/WGS sequence should be genomic"));
2483             eval = validator.Validate(seh, options);
2484             CheckErrors (*eval, expected_errors);
2485             unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic);
2486             entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
2487             delete expected_errors[0];
2488             expected_errors[0] = NULL;
2489             expected_errors.back()->SetErrCode("HTGS_STS_GSS_WGSshouldNotBeRNA");
2490             expected_errors.back()->SetErrMsg("HTGS/STS/GSS/WGS sequence should not be RNA");
2491             eval = validator.Validate(seh, options);
2492             CheckErrors (*eval, expected_errors);
2493         } else {
2494             if (IsProteinTech(i)) {
2495                 expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide", "Nucleic acid with protein sequence method"));
2496             }
2497             if (i == CMolInfo::eTech_barcode) {
2498                 expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique", "Molinfo.tech barcode without BARCODE keyword"));
2499             } else if (i == CMolInfo::eTech_tsa) {
2500                 expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2501                 expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2502             }
2503             eval = validator.Validate(seh, options);
2504             CheckErrors (*eval, expected_errors);
2505         }
2506         CLEAR_ERRORS
2507     }
2508 
2509     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
2510     SetTech (entry, CMolInfo::eTech_tsa);
2511     //AddChromosomeNoLocation(expected_errors, "lcl|good");
2512     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolType", "Molecule type (DNA) does not match biomol (RNA)"));
2513     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2514     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "WrongBiomolForTSA", "Biomol \"cRNA\" is not appropriate for sequences that use the TSA technique."));
2515     eval = validator.Validate(seh, options);
2516     CheckErrors (*eval, expected_errors);
2517 
2518     CLEAR_ERRORS
2519 
2520     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TSAshouldBNotBeDNA", "TSA sequence should not be DNA"));
2521     eval = validator.GetTSAConflictingBiomolTechErrors(seh);
2522     CheckErrors (*eval, expected_errors);
2523     eval = validator.GetTSAConflictingBiomolTechErrors(*(seh.GetSeq().GetCompleteBioseq()));
2524     CheckErrors (*eval, expected_errors);
2525     CLEAR_ERRORS
2526 }
2527 
2528 
// A Seq-id name containing a space must be reported as a critical
// SeqIdNameHasSpace error.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_SeqIdNameHasSpace)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Turn the first id into an "other" id whose name has an embedded space.
    auto& other_id = entry->SetSeq().SetId().front()->SetOther();
    other_id.SetAccession("NC_123456");
    other_id.SetName("good one");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("ref|NC_123456|good one", eDiag_Critical, "SeqIdNameHasSpace", "Seq-id.name 'good one' should be a single word without any spaces"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
2545 
2546 
// Test case for DuplicateSegmentReferences on segmented Bioseqs.
// The entire body is compiled out with "#if 0" (see the VR-779 note below),
// so this test case currently runs as an empty shell.  The disabled code
// builds a segmented sequence with two segments pointing at gb|AY123456 and
// lists SeqLocOrder plus DuplicateSegmentReferences as expected errors.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_DuplicateSegmentReferences)
{
#if 0
    // removed per VR-779
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetInst().ResetSeq_data();
    entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
    // two segments, both referring to the whole of the same accession
    CRef<CSeq_loc> seg1 (new CSeq_loc());
    seg1->SetWhole().SetGenbank().SetAccession("AY123456");
    entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg1);
    CRef<CSeq_loc> seg2 (new CSeq_loc());
    seg2->SetWhole().SetGenbank().SetAccession("AY123456");
    entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(seg2);
    entry->SetSeq().SetInst().SetLength(970);

    CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
    // need to call this statement before calling AddDefaults
    // to make sure that we can fetch the sequence referenced by the
    // delta sequence so that we can detect that the loc in the
    // delta sequence is longer than the referenced sequence
    // NOTE(review): the wording above mentions a delta sequence, but this
    // test builds a segmented sequence - presumably copied from another test.
    CGBDataLoader::RegisterInObjectManager(*objmgr);
    CScope scope(*objmgr);
    scope.AddDefaults();
    CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);

    CValidator validator(*objmgr);

    // Set validator options
    unsigned int options = CValidator::eVal_need_isojta
                          | CValidator::eVal_far_fetch_mrna_products
                          | CValidator::eVal_validate_id_set | CValidator::eVal_indexer_version
                          | CValidator::eVal_use_entrez;

    // list of expected errors
    vector< CExpectedError *> expected_errors;
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLocOrder", "Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, gb|AY123456|]]"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DuplicateSegmentReferences", "Segmented sequence has multiple references to gb|AY123456"));
    CConstRef<CValidError> eval;

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // second variant: the second segment becomes an interval (0..484) on the
    // same accession; the duplicate-reference expectation is changed to a warning
    seg2->SetInt().SetId().SetGenbank().SetAccession("AY123456");
    seg2->SetInt().SetFrom(0);
    seg2->SetInt().SetTo(484);
    expected_errors[0]->SetErrMsg("Segmented BioseqIntervals out of order in SeqLoc [[gb|AY123456|, 1-485]]");
    expected_errors[1]->SetSeverity(eDiag_Warning);
    expected_errors[1]->SetErrMsg("Segmented sequence has multiple references to gb|AY123456 that are not SEQLOC_WHOLE");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
#endif
}
2601 
2602 
// A protein ending in "XX" (with the nucleotide ending in a run of Ns) must
// be reported with TrailingX, alongside the TerminalNs and HighNpercent3Prime
// diagnostics on the nucleotide.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_TrailingX)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    auto& members = entry->SetSet().SetSeq_set();
    CRef<CSeq_entry> nuc = members.front();
    CRef<CSeq_entry> prot = members.back();
    CRef<CSeq_feat> prot_feat = prot->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Nucleotide: 27 bases, the last six of them N.
    auto& nuc_inst = nuc->SetSeq().SetInst();
    nuc_inst.SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATANNNNNN");
    nuc_inst.SetLength(27);

    // Protein: 9 residues, the last two of them X, flagged as 3'-partial.
    auto& prot_inst = prot->SetSeq().SetInst();
    prot_inst.SetSeq_data().SetIupacaa().Set("MPRKTEIXX");
    prot_inst.SetLength(9);
    unit_test_util::SetCompleteness(prot, CMolInfo::eCompleteness_no_right);

    // Protein feature spans the whole protein and is partial at its stop.
    prot_feat->SetLocation().SetInt().SetTo(8);
    prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
    prot_feat->SetPartial(true);

    // The coding region is marked partial at its stop as well.
    cds_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
    cds_feat->SetPartial(true);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TerminalNs", "N at end of sequence"));
    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "TrailingX", "Sequence ends in 2 trailing Xs"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "HighNpercent3Prime",
        "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
2635 
2636 
TestBadProtId(const string & id_str)2637 void TestBadProtId(const string& id_str)
2638 {
2639     // bad for just prots
2640     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
2641     CRef<CSeq_id> bad_id(new CSeq_id());
2642     bad_id->SetGenbank().SetAccession(id_str);
2643     CRef<CSeq_id> good_nuc_id(new CSeq_id());
2644     good_nuc_id->SetLocal().SetStr("nuc");
2645     CRef<CSeq_id> good_prot_id(new CSeq_id());
2646     good_prot_id->SetLocal().SetStr("prot");
2647 
2648     unit_test_util::ChangeNucId(entry, good_nuc_id);
2649     unit_test_util::ChangeProtId(entry, bad_id);
2650 
2651     STANDARD_SETUP
2652 
2653     expected_errors.push_back(new CExpectedError("gb|" + id_str + "|", eDiag_Error, "BadSeqIdFormat", "Bad accession " + id_str));
2654     //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2655 
2656     eval = validator.Validate(seh, options);
2657     CheckErrors(*eval, expected_errors);
2658     CLEAR_ERRORS
2659 }
2660 
2661 
TestGoodProtId(const string & id_str)2662 void TestGoodProtId(const string& id_str)
2663 {
2664     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
2665     CRef<CSeq_id> bad_id(new CSeq_id());
2666     bad_id->SetGenbank().SetAccession(id_str);
2667     CRef<CSeq_id> good_nuc_id(new CSeq_id());
2668     good_nuc_id->SetLocal().SetStr("nuc");
2669     CRef<CSeq_id> good_prot_id(new CSeq_id());
2670     good_prot_id->SetLocal().SetStr("prot");
2671 
2672     unit_test_util::ChangeNucId(entry, good_nuc_id);
2673     unit_test_util::ChangeProtId(entry, bad_id);
2674 
2675     STANDARD_SETUP
2676 
2677     eval = validator.Validate(seh, options);
2678     //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
2679 
2680     CheckErrors(*eval, expected_errors);
2681     CLEAR_ERRORS
2682 }
2683 
2684 
TestGoodNucId(const string & id_str)2685 void TestGoodNucId(const string& id_str)
2686 {
2687     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
2688     CRef<CSeq_id> bad_id(new CSeq_id());
2689     bad_id->SetGenbank().SetAccession(id_str);
2690     CRef<CSeq_id> good_prot_id(new CSeq_id());
2691     good_prot_id->SetLocal().SetStr("prot");
2692     unit_test_util::ChangeNucId(entry, bad_id);
2693     unit_test_util::ChangeProtId(entry, good_prot_id);
2694     bool is_wgs = false;
2695     if (id_str.length() == 12 || id_str.length() == 13 || id_str.length() == 14 || id_str.length() == 15) {
2696         SetTech(entry->SetSet().SetSeq_set().front(), CMolInfo::eTech_wgs);
2697         is_wgs = true;
2698     }
2699 
2700     STANDARD_SETUP
2701 
2702     if (is_wgs) {
2703         AddChromosomeNoLocation(expected_errors, "gb|" + id_str + "|");
2704     }
2705     eval = validator.Validate(seh, options);
2706     CheckErrors(*eval, expected_errors);
2707     CLEAR_ERRORS
2708 }
2709 
2710 
BOOST_FIXTURE_TEST_CASE(Test_SEQ_INST_BadSeqIdFormat,CGenBankFixture)2711 BOOST_FIXTURE_TEST_CASE(Test_SEQ_INST_BadSeqIdFormat, CGenBankFixture)
2712 {
2713     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
2714     CRef<CSeq_entry> nuc_entry = entry->SetSet().SetSeq_set().front();
2715     CRef<CSeq_entry> prot_entry = entry->SetSet().SetSeq_set().back();
2716     CRef<CSeq_feat> prot_feat = prot_entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
2717     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
2718 
2719     STANDARD_SETUP
2720 
2721     expected_errors.push_back(new CExpectedError("",eDiag_Error, "BadSeqIdFormat", "Bad accession"));
2722 
2723     vector<string> bad_ids;
2724     bad_ids.push_back("AY123456ABC");  // can't have letters after digits
2725     bad_ids.push_back("A1234");        // for a single letter, only acceptable number of digits is 5
2726     bad_ids.push_back("A123456");
2727     bad_ids.push_back("AY12345");      // for two letters, only acceptable number of digits is 6
2728     bad_ids.push_back("AY1234567");
2729     bad_ids.push_back("ABC1234");      // three letters bad unless prot and 5 digits
2730     bad_ids.push_back("ABC123456");
2731     bad_ids.push_back("ABCD1234567");  // four letters
2732     bad_ids.push_back("ABCDE123456");  // five letters
2733     bad_ids.push_back("ABCDE12345678");
2734 
2735     vector<string> bad_nuc_ids;
2736     bad_nuc_ids.push_back("ABC12345");
2737 
2738     vector<string> bad_prot_ids;
2739     bad_prot_ids.push_back("AY123456");
2740     bad_prot_ids.push_back("A12345");
2741 
2742     vector<string> good_ids;
2743 
2744     vector<string> good_nuc_ids;
2745     good_nuc_ids.push_back("AY123456");
2746     good_nuc_ids.push_back("A12345");
2747     good_nuc_ids.push_back("ABCD123456789");
2748     good_nuc_ids.push_back("ABCD1234567890");
2749 
2750     vector<string> good_prot_ids;
2751     good_prot_ids.push_back("ABC12345");
2752 
2753 
2754     CRef<CSeq_id> good_nuc_id(new CSeq_id());
2755     good_nuc_id->SetLocal().SetStr("nuc");
2756     CRef<CSeq_id> good_prot_id(new CSeq_id());
2757     good_prot_id->SetLocal().SetStr("prot");
2758 
2759     CRef<CSeq_id> bad_id(new CSeq_id());
2760 
2761     // bad for both
2762     for (vector<string>::iterator id_it = bad_ids.begin();
2763         id_it != bad_ids.end();
2764         ++id_it) {
2765         string id_str = *id_it;
2766         string acc_str = "gb|" + id_str + "|";
2767         ChangeErrorAcc(expected_errors, acc_str);
2768         expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2769 
2770         //GenBank
2771         scope.RemoveTopLevelSeqEntry(seh);
2772         scope.ResetDataAndHistory();
2773         bad_id->SetGenbank().SetAccession(id_str);
2774         unit_test_util::ChangeNucId(entry, bad_id);
2775         unit_test_util::ChangeProtId(entry, good_prot_id);
2776         seh = scope.AddTopLevelSeqEntry(*entry);
2777         eval = validator.Validate(seh, options);
2778         CheckErrors(*eval, expected_errors);
2779         scope.RemoveTopLevelSeqEntry(seh);
2780         scope.ResetDataAndHistory();
2781         unit_test_util::ChangeNucId(entry, good_nuc_id);
2782         unit_test_util::ChangeProtId(entry, bad_id);
2783         seh = scope.AddTopLevelSeqEntry(*entry);
2784         eval = validator.Validate(seh, options);
2785         CheckErrors(*eval, expected_errors);
2786 
2787     }
2788 
2789     for (vector<string>::iterator id_it = bad_ids.begin();
2790         id_it != bad_ids.end();
2791         ++id_it) {
2792         string id_str = *id_it;
2793         id_str = "B" + id_str.substr(1);
2794         expected_errors[0]->SetAccession("embl|" + id_str + "|");
2795         expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2796 
2797         // EMBL
2798         scope.RemoveTopLevelSeqEntry(seh);
2799         scope.ResetDataAndHistory();
2800         bad_id->SetEmbl().SetAccession(id_str);
2801         unit_test_util::ChangeNucId(entry, bad_id);
2802         unit_test_util::ChangeProtId(entry, good_prot_id);
2803         seh = scope.AddTopLevelSeqEntry(*entry);
2804         eval = validator.Validate(seh, options);
2805         expected_errors[0]->SetAccession("emb|" + id_str + "|");
2806         CheckErrors (*eval, expected_errors);
2807         scope.RemoveTopLevelSeqEntry(seh);
2808         scope.ResetDataAndHistory();
2809         unit_test_util::ChangeNucId(entry, good_nuc_id);
2810         unit_test_util::ChangeProtId(entry, bad_id);
2811         seh = scope.AddTopLevelSeqEntry(*entry);
2812         eval = validator.Validate(seh, options);
2813         CheckErrors (*eval, expected_errors);
2814 
2815     }
2816 
2817     for (vector<string>::iterator id_it = bad_ids.begin();
2818         id_it != bad_ids.end();
2819         ++id_it) {
2820         string id_str = *id_it;
2821         id_str = "C" + id_str.substr(1);
2822         expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2823         expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2824 
2825         // DDBJ
2826         scope.RemoveTopLevelSeqEntry(seh);
2827         scope.ResetDataAndHistory();
2828         bad_id->SetDdbj().SetAccession(id_str);
2829         unit_test_util::ChangeNucId(entry, bad_id);
2830         unit_test_util::ChangeProtId(entry, good_prot_id);
2831         seh = scope.AddTopLevelSeqEntry(*entry);
2832         eval = validator.Validate(seh, options);
2833         expected_errors[0]->SetAccession("dbj|" + id_str + "|");
2834         CheckErrors (*eval, expected_errors);
2835         scope.RemoveTopLevelSeqEntry(seh);
2836         scope.ResetDataAndHistory();
2837         unit_test_util::ChangeNucId(entry, good_nuc_id);
2838         unit_test_util::ChangeProtId(entry, bad_id);
2839         seh = scope.AddTopLevelSeqEntry(*entry);
2840         eval = validator.Validate(seh, options);
2841         CheckErrors (*eval, expected_errors);
2842 
2843     }
2844 
2845     // bad for just nucs
2846     for (vector<string>::iterator id_it = bad_nuc_ids.begin();
2847          id_it != bad_nuc_ids.end();
2848          ++id_it) {
2849         string id_str = *id_it;
2850         bad_id->SetGenbank().SetAccession(id_str);
2851         scope.RemoveTopLevelSeqEntry(seh);
2852         unit_test_util::ChangeNucId(entry, bad_id);
2853         unit_test_util::ChangeProtId(entry, good_prot_id);
2854         expected_errors[0]->SetAccession("gb|"+id_str+"|");
2855         expected_errors[0]->SetErrMsg("Bad accession " + id_str);
2856         seh = scope.AddTopLevelSeqEntry(*entry);
2857         eval = validator.Validate(seh, options);
2858         CheckErrors (*eval, expected_errors);
2859     }
2860 
2861     // bad for just prots
2862     for (auto id_it : bad_prot_ids) {
2863         TestBadProtId(id_it);
2864     }
2865 
2866     CLEAR_ERRORS
2867 
2868     // good for both
2869     for (vector<string>::iterator id_it = good_ids.begin();
2870          id_it != good_ids.end();
2871          ++id_it) {
2872         string id_str = *id_it;
2873         bad_id->SetGenbank().SetAccession(id_str);
2874         scope.RemoveTopLevelSeqEntry(seh);
2875         unit_test_util::ChangeNucId(entry, bad_id);
2876         unit_test_util::ChangeProtId(entry, good_prot_id);
2877         seh = scope.AddTopLevelSeqEntry(*entry);
2878         eval = validator.Validate(seh, options);
2879         //AddChromosomeNoLocation(expected_errors, "gb|" + *id_it + "|");
2880         CheckErrors (*eval, expected_errors);
2881         scope.RemoveTopLevelSeqEntry(seh);
2882         unit_test_util::ChangeNucId(entry, good_nuc_id);
2883         unit_test_util::ChangeProtId(entry, bad_id);
2884         seh = scope.AddTopLevelSeqEntry(*entry);
2885         eval = validator.Validate(seh, options);
2886         CheckErrors (*eval, expected_errors);
2887         CLEAR_ERRORS
2888     }
2889 
2890     // good for nucs
2891     for (auto id_it : good_nuc_ids) {
2892         TestGoodNucId(id_it);
2893     }
2894 
2895     // good for just prots
2896     for (auto id_it : good_prot_ids) {
2897         TestGoodProtId(id_it);
2898     }
2899 
2900     // if GI, needs version
2901     scope.RemoveTopLevelSeqEntry(seh);
2902     bad_id->SetGenbank().SetAccession("AY123456");
2903     bad_id->SetGenbank().SetVersion(0);
2904     unit_test_util::ChangeNucId(entry, bad_id);
2905     unit_test_util::ChangeProtId(entry, good_prot_id);
2906     CRef<CSeq_id> gi_id(new CSeq_id("gi|21914627"));
2907     nuc_entry->SetSeq().SetId().push_back(gi_id);
2908     seh = scope.AddTopLevelSeqEntry(*entry);
2909     eval = validator.Validate(seh, options);
2910     expected_errors.push_back (new CExpectedError ("gb|AY123456|", eDiag_Critical, "BadSeqIdFormat",
2911                                                    "Accession AY123456 has 0 version"));
2912     expected_errors.push_back (new CExpectedError ("gb|AY123456|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123456|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
2913     //AddChromosomeNoLocation(expected_errors, "gb|AY123456|");
2914     CheckErrors (*eval, expected_errors);
2915 
2916     CLEAR_ERRORS
2917 
2918     nuc_entry->SetSeq().SetId().pop_back();
2919 
2920     // id that is too long
2921     scope.RemoveTopLevelSeqEntry(seh);
2922     bad_id->SetLocal().SetStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2923     unit_test_util::ChangeNucId(entry, bad_id);
2924     seh = scope.AddTopLevelSeqEntry(*entry);
2925     eval = validator.Validate(seh, options);
2926     //AddChromosomeNoLocation(expected_errors, "lcl|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234");
2927     CheckErrors (*eval, expected_errors);
2928 
2929     CLEAR_ERRORS
2930 
2931     // shouldn't report if ncbifile ID
2932     scope.RemoveTopLevelSeqEntry(seh);
2933     CRef<CSeq_id> ncbifile(new CSeq_id("gnl|NCBIFILE|ABCDEFGHIJKLMNOPQRSTUVWXYZ012345678901234"));
2934     unit_test_util::ChangeNucId(entry, good_nuc_id);
2935     nuc_entry->SetSeq().SetId().push_back(ncbifile);
2936     seh = scope.AddTopLevelSeqEntry(*entry);
2937     eval = validator.Validate(seh, options);
2938     //AddChromosomeNoLocation(expected_errors, entry);
2939     CheckErrors (*eval, expected_errors);
2940     nuc_entry->SetSeq().SetId().pop_back();
2941     CLEAR_ERRORS
2942 
2943     // report if database name len too long
2944     scope.RemoveTopLevelSeqEntry(seh);
2945     entry = unit_test_util::BuildGoodSeq();
2946     CRef<CSeq_id> general(new CSeq_id());
2947     general->SetGeneral().SetDb("thisdatabasevalueislong");
2948     general->SetGeneral().SetTag().SetStr("b");
2949     entry->SetSeq().ResetId();
2950     entry->SetSeq().SetId().push_back(general);
2951     seh = scope.AddTopLevelSeqEntry(*entry);
2952     expected_errors.push_back (new CExpectedError ("gnl|thisdatabasevalueislong|b", eDiag_Critical, "BadSeqIdFormat",
2953                                                    "General database longer than 20 characters"));
2954 
2955     //AddChromosomeNoLocation(expected_errors, "gnl|thisdatabasevalueislong|b");
2956     eval = validator.Validate(seh, options);
2957     CheckErrors (*eval, expected_errors);
2958 
2959     CLEAR_ERRORS
2960 
2961     // do not report forward slash
2962     scope.RemoveTopLevelSeqEntry(seh);
2963     entry = unit_test_util::BuildGoodSeq();
2964     entry->SetSeq().SetId().front()->SetLocal().SetStr("a/b");
2965     seh = scope.AddTopLevelSeqEntry(*entry);
2966     eval = validator.Validate(seh, options);
2967     //AddChromosomeNoLocation(expected_errors, "lcl|a/b");
2968     CheckErrors (*eval, expected_errors);
2969 
2970     CLEAR_ERRORS
2971 }
2972 
2973 
TestOneGeneralSeqId(const string & db,const string & tag,const string & errmsg)2974 void TestOneGeneralSeqId(const string& db, const string& tag, const string& errmsg)
2975 {
2976     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
2977     CRef<CSeq_id> id(new CSeq_id());
2978     id->SetGeneral().SetDb(db);
2979     id->SetGeneral().SetTag().SetStr(tag);
2980     entry->SetSeq().SetId().push_back(id);
2981 
2982     STANDARD_SETUP
2983 
2984     string acc_str = "lcl|good";
2985     if (!errmsg.empty()) {
2986         expected_errors.push_back(new CExpectedError(acc_str, eDiag_Warning, "BadSeqIdFormat",
2987             errmsg));
2988     }
2989     //AddChromosomeNoLocation(expected_errors, entry);
2990     eval = validator.Validate(seh, options);
2991     CheckErrors(*eval, expected_errors);
2992 
2993     CLEAR_ERRORS
2994 }
2995 
2996 
// A blank inside either the tag or the database name of a general Seq-id is
// expected to be reported as a bad character in the sequence ID.
BOOST_AUTO_TEST_CASE(Test_VR_748)
{
    struct SGeneralIdCase {
        const char* db;
        const char* tag;
        const char* errmsg;
    };

    const SGeneralIdCase cases[] = {
        // blank at the start of the tag
        { "PRJNA318798", " CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA318798| CpPA02_0001'" },
        // blank inside the database name
        { "PRJNA3 18798", "CpPA02_0001", "Bad character ' ' in sequence ID 'gnl|PRJNA3 18798|CpPA02_0001'" }
    };

    for (const SGeneralIdCase& one_case : cases) {
        TestOneGeneralSeqId(one_case.db, one_case.tag, one_case.errmsg);
    }
}
3002 
3003 
// The same accession used as the primary id and as a secondary (extra)
// accession must produce BadSecondaryAccn, both when the secondary accession
// sits in a GenBank block and when it sits in an EMBL block.
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_BadSecondaryAccn)
{
    const string accession = "AY123456";

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetId().front()->SetGenbank().SetAccession(accession);

    STANDARD_SETUP

    // GenBank block listing the primary accession as an extra accession.
    CRef<CSeqdesc> block_desc(new CSeqdesc());
    block_desc->SetGenbank().SetExtra_accessions().push_back(accession);
    entry->SetSeq().SetDescr().Set().push_back(block_desc);

    expected_errors.push_back(
        new CExpectedError("gb|" + accession + "|", eDiag_Error, "BadSecondaryAccn",
                           accession + " used for both primary and secondary accession"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Turn the same descriptor into an EMBL block; the same error is expected.
    block_desc->SetEmbl().SetExtra_acc().push_back(accession);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
3026 
3027 
// A sequence whose only id is GI 0 must be reported with ZeroGiNumber
// (critical) and GiWithoutAccession (error).
BOOST_AUTO_TEST_CASE(Test_SEQ_INST_ZeroGiNumber)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    auto& seq_id = *entry->SetSeq().SetId().front();
    seq_id.SetGi(ZERO_GI);

    STANDARD_SETUP

    const string acc_str = "gi|0";
    expected_errors.push_back(
        new CExpectedError(acc_str, eDiag_Critical, "ZeroGiNumber", "Invalid GI number"));
    expected_errors.push_back(
        new CExpectedError(acc_str, eDiag_Error, "GiWithoutAccession", "No accession on sequence with gi number"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
3043 
3044 
// A Seq-hist "replaced-by" or "replaces" entry that names the Bioseq's own
// GI must produce HistoryGiCollision when the history entry carries a date,
// and must produce nothing when no date has been set.
BOOST_AUTO_TEST_CASE(Test_HistoryGiCollision)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // gb|AY123456.1| plus a GI on the same Bioseq
    auto& primary_id = *entry->SetSeq().SetId().front();
    primary_id.SetGenbank().SetAccession("AY123456");
    primary_id.SetGenbank().SetVersion(1);
    CRef<CSeq_id> gi_id(new CSeq_id());
    gi_id->SetGi(GI_CONST(21914627));
    entry->SetSeq().SetId().push_back(gi_id);

    STANDARD_SETUP

    // History id carrying the same GI as the Bioseq itself.
    CRef<CSeq_id> hist_id(new CSeq_id());
    hist_id->SetGi(GI_CONST(21914627));

    auto& hist = entry->SetSeq().SetInst().SetHist();

    // dated replaced-by
    hist.SetReplaced_by().SetIds().push_back(hist_id);
    hist.SetReplaced_by().SetDate().SetStd().SetYear(2008);

    expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "HistoryGiCollision", "Replaced by gi (21914627) is same as current Bioseq"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // dated replaces
    hist.ResetReplaced_by();
    hist.SetReplaces().SetIds().push_back(hist_id);
    hist.SetReplaces().SetDate().SetStd().SetYear(2008);
    expected_errors[0]->SetErrMsg("Replaces gi (21914627) is same as current Bioseq");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // should not generate errors if date has not been set: replaced-by
    hist.ResetReplaces();
    hist.SetReplaced_by().SetIds().push_back(hist_id);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // ... and replaces
    hist.ResetReplaced_by();
    hist.SetReplaces().SetIds().push_back(hist_id);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
3089 
3090 
// A sequence identified only by a (non-zero) GI must be reported with
// GiWithoutAccession.
BOOST_AUTO_TEST_CASE(Test_GiWithoutAccession)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    auto& seq_id = *entry->SetSeq().SetId().front();
    seq_id.SetGi(GI_CONST(123456));

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("gi|123456", eDiag_Error, "GiWithoutAccession",
                           "No accession on sequence with gi number"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
3105 
3106 
TestOneOtherAcc(CRef<CSeq_id> other_acc,bool id_change,bool conflict,bool need_hist=false)3107 void TestOneOtherAcc(CRef<CSeq_id> other_acc, bool id_change, bool conflict, bool need_hist = false)
3108 {
3109     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
3110     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3111     entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3112     CRef<CSeq_id> gi_id(new CSeq_id());
3113     gi_id->SetGi(GI_CONST(21914627));
3114     entry->SetSeq().SetId().push_back(gi_id);
3115     entry->SetSeq().SetId().push_back(other_acc);
3116     string acc_str = "gb|AY123456.1|";
3117 
3118     STANDARD_SETUP
3119 
3120     if (conflict) {
3121         expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error, "ConflictingIdsOnBioseq",
3122             "Conflicting ids on a Bioseq: (gb|AY123456.1| - " + other_acc->AsFastaString() + ")"));
3123     }
3124     expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3125     if (id_change) {
3126         expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
3127     }
3128     if (need_hist) {
3129         expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Info, "HistAssemblyMissing",
3130             "TPA record gb|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3131     }
3132     //AddChromosomeNoLocation(expected_errors, acc_str);
3133     eval = validator.Validate(seh, options);
3134     CheckErrors(*eval, expected_errors);
3135 
3136     CLEAR_ERRORS
3137 }
3138 
3139 
BOOST_FIXTURE_TEST_CASE(Test_MultipleAccessions,CGenBankFixture)3140 BOOST_FIXTURE_TEST_CASE(Test_MultipleAccessions, CGenBankFixture)
3141 {
3142     CRef<CSeq_id> other_acc(new CSeq_id());
3143 
3144     // genbank, ddbj, embl, tpg, tpe, tpd, other, pir, swissprot, and prf all count as accessionts
3145     // genbank
3146     other_acc->SetGenbank().SetAccession("AY123457");
3147     other_acc->SetGenbank().SetVersion(1);
3148     TestOneOtherAcc(other_acc, true, true);
3149 
3150     // ddbj
3151     other_acc->SetDdbj().SetAccession("AY123457");
3152     other_acc->SetDdbj().SetVersion(1);
3153     TestOneOtherAcc(other_acc, false, true);
3154 
3155     // embl
3156     other_acc->SetEmbl().SetAccession("AY123457");
3157     other_acc->SetEmbl().SetVersion(1);
3158     TestOneOtherAcc(other_acc, false, true);
3159 
3160     // pir
3161     other_acc->SetPir().SetAccession("AY123457");
3162     other_acc->SetPir().SetVersion(1);
3163     TestOneOtherAcc(other_acc, false, false);
3164 
3165     // swissprot
3166     other_acc->SetSwissprot().SetAccession("AY123457");
3167     other_acc->SetSwissprot().SetVersion(1);
3168     TestOneOtherAcc(other_acc, false, false);
3169 
3170     // prf
3171     other_acc->SetPrf().SetAccession("AY123457");
3172     other_acc->SetPrf().SetVersion(1);
3173     TestOneOtherAcc(other_acc, false, false);
3174 
3175     // tpg
3176     other_acc->SetTpg().SetAccession("AY123457");
3177     other_acc->SetTpg().SetVersion(1);
3178     TestOneOtherAcc(other_acc, false, true, true);
3179 
3180     // tpe
3181     other_acc->SetTpe().SetAccession("AY123457");
3182     other_acc->SetTpe().SetVersion(1);
3183     TestOneOtherAcc(other_acc, false, true, true);
3184 
3185     // tpd
3186     other_acc->SetTpd().SetAccession("AY123457");
3187     other_acc->SetTpd().SetVersion(1);
3188     TestOneOtherAcc(other_acc, false, true, true);
3189 
3190     // other
3191     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
3192     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3193     entry->SetSeq().SetId().front()->SetGenbank().SetVersion(1);
3194     CRef<CSeq_id> gi_id(new CSeq_id());
3195     gi_id->SetGi(GI_CONST(21914627));
3196     entry->SetSeq().SetId().push_back(gi_id);
3197     entry->SetSeq().SetId().push_back(other_acc);
3198     other_acc->SetOther().SetAccession("NC_123457");
3199     other_acc->SetOther().SetVersion(1);
3200 
3201     STANDARD_SETUP
3202 
3203     string acc_str = "gb|AY123456.1|";
3204     expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "INSDRefSeqPackaging", "INSD and RefSeq records should not be present in the same set"));
3205     expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "MultipleAccessions", "Multiple accessions on sequence with gi number"));
3206     //AddChromosomeNoLocation(expected_errors, acc_str);
3207     eval = validator.Validate(seh, options);
3208     CheckErrors (*eval, expected_errors);
3209 
3210     CLEAR_ERRORS
3211 }
3212 
3213 
BOOST_AUTO_TEST_CASE(Test_HistAssemblyMissing)3214 BOOST_AUTO_TEST_CASE(Test_HistAssemblyMissing)
3215 {
3216     CRef<CSeq_entry> tpg_entry = unit_test_util::BuildGoodSeq();
3217     tpg_entry->SetSeq().SetId().front()->SetTpg().SetAccession("AY123456");
3218     tpg_entry->SetSeq().SetId().front()->SetTpg().SetVersion(1);
3219 
3220     CRef<CSeq_entry> tpe_entry = unit_test_util::BuildGoodSeq();
3221     tpe_entry->SetSeq().SetId().front()->SetTpe().SetAccession("AY123456");
3222     tpe_entry->SetSeq().SetId().front()->SetTpe().SetVersion(1);
3223 
3224     CRef<CSeq_entry> tpd_entry = unit_test_util::BuildGoodSeq();
3225     tpd_entry->SetSeq().SetId().front()->SetTpd().SetAccession("AY123456");
3226     tpd_entry->SetSeq().SetId().front()->SetTpd().SetVersion(1);
3227 
3228     STANDARD_SETUP_NAME(tpg_entry)
3229 
3230     // tpg
3231     expected_errors.push_back(new CExpectedError("tpg|AY123456.1|", eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
3232     //AddChromosomeNoLocation(expected_errors, tpg_entry);
3233     eval = validator.Validate(seh, options);
3234     CheckErrors (*eval, expected_errors);
3235 
3236     // tpe
3237     scope.RemoveTopLevelSeqEntry(seh);
3238     seh = scope.AddTopLevelSeqEntry(*tpe_entry);
3239     ChangeErrorAcc(expected_errors, "tpe|AY123456.1|");
3240     expected_errors[0]->SetErrMsg("TPA record tpe|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3241     eval = validator.Validate(seh, options);
3242     CheckErrors (*eval, expected_errors);
3243 
3244 
3245     // tpd
3246     scope.RemoveTopLevelSeqEntry(seh);
3247     seh = scope.AddTopLevelSeqEntry(*tpd_entry);
3248     ChangeErrorAcc(expected_errors, "tpd|AY123456.1|");
3249     expected_errors[0]->SetErrMsg("TPA record tpd|AY123456.1| should have Seq-hist.assembly for PRIMARY block");
3250     eval = validator.Validate(seh, options);
3251     CheckErrors (*eval, expected_errors);
3252 
3253     CLEAR_ERRORS
3254 
3255     // error suppressed if keyword present
3256     CRef<CSeqdesc> block(new CSeqdesc());
3257     block->SetGenbank().SetKeywords().push_back("TPA:reassembly");
3258     tpg_entry->SetSeq().SetDescr().Set().push_back(block);
3259     scope.RemoveTopLevelSeqEntry(seh);
3260     seh = scope.AddTopLevelSeqEntry(*tpg_entry);
3261     eval = validator.Validate(seh, options);
3262     //AddChromosomeNoLocation(expected_errors, tpg_entry);
3263 
3264     CheckErrors (*eval, expected_errors);
3265     block->SetEmbl().SetKeywords().push_back("TPA:reassembly");
3266     eval = validator.Validate(seh, options);
3267     CheckErrors (*eval, expected_errors);
3268     CLEAR_ERRORS
3269 }
3270 
3271 
BOOST_AUTO_TEST_CASE(Test_TerminalNs)3272 BOOST_AUTO_TEST_CASE(Test_TerminalNs)
3273 {
3274     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
3275     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNNN");
3276     entry->SetSeq().SetInst().SetLength(62);
3277 
3278     STANDARD_SETUP
3279 
3280     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3281     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3282     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
3283         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3284     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
3285         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3286     //AddChromosomeNoLocation(expected_errors, entry);
3287     eval = validator.Validate(seh, options);
3288     CheckErrors (*eval, expected_errors);
3289 
3290     // warning level changes if not local only
3291     scope.RemoveTopLevelSeqEntry(seh);
3292     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3293     seh = scope.AddTopLevelSeqEntry(*entry);
3294     ChangeErrorAcc(expected_errors, "gb|AY123456|");
3295     expected_errors[0]->SetSeverity(eDiag_Error);
3296     expected_errors[1]->SetSeverity(eDiag_Error);
3297     eval = validator.Validate(seh, options);
3298     CheckErrors (*eval, expected_errors);
3299 
3300     CLEAR_ERRORS
3301 
3302     // also try delta sequence
3303     scope.RemoveTopLevelSeqEntry(seh);
3304     entry = unit_test_util::BuildGoodDeltaSeq ();
3305     entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNCCC");
3306     entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCCNNNNNNNNN");
3307     seh = scope.AddTopLevelSeqEntry(*entry);
3308     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at beginning of sequence"));
3309     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalNs", "N at end of sequence"));
3310     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 52 percent Ns"));
3311     eval = validator.Validate(seh, options);
3312     CheckErrors (*eval, expected_errors);
3313 
3314     // 10 Ns but just local stays at warning
3315     scope.RemoveTopLevelSeqEntry(seh);
3316     entry = unit_test_util::BuildGoodDeltaSeq ();
3317     entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNNNNNNNNCC");
3318     entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("CCNNNNNNNNNN");
3319     seh = scope.AddTopLevelSeqEntry(*entry);
3320     expected_errors[2]->SetErrMsg ("Sequence contains 58 percent Ns");
3321     eval = validator.Validate(seh, options);
3322     CheckErrors (*eval, expected_errors);
3323 
3324     // 10 Ns but now has non-local ID, error
3325     scope.RemoveTopLevelSeqEntry(seh);
3326     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
3327     seh = scope.AddTopLevelSeqEntry(*entry);
3328     ChangeErrorAcc(expected_errors, "gb|AY123456|");
3329     expected_errors[0]->SetSeverity(eDiag_Error);
3330     expected_errors[1]->SetSeverity(eDiag_Error);
3331     eval = validator.Validate(seh, options);
3332     CheckErrors (*eval, expected_errors);
3333 
3334     // NC and patent IDs back to warning
3335     scope.RemoveTopLevelSeqEntry(seh);
3336     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3337     seh = scope.AddTopLevelSeqEntry(*entry);
3338     ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3339     expected_errors[0]->SetSeverity(eDiag_Warning);
3340     expected_errors[1]->SetSeverity(eDiag_Warning);
3341     eval = validator.Validate(seh, options);
3342     CheckErrors (*eval, expected_errors);
3343 
3344     scope.RemoveTopLevelSeqEntry(seh);
3345     entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3346     entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
3347     entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
3348     seh = scope.AddTopLevelSeqEntry(*entry);
3349     ChangeErrorAcc(expected_errors, "pat|USA|1|1");
3350     delete expected_errors[2];
3351     expected_errors.pop_back();
3352     eval = validator.Validate(seh, options);
3353     CheckErrors (*eval, expected_errors);
3354 
3355     CLEAR_ERRORS
3356 
3357     // no more TerminalNs warnings if circular
3358     entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3359     unit_test_util::SetCompleteness(entry, CMolInfo::eCompleteness_complete);
3360     expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
3361                               "Suspicious use of complete"));
3362     //AddChromosomeNoLocation(expected_errors, entry);
3363 
3364     eval = validator.Validate(seh, options);
3365     CheckErrors (*eval, expected_errors);
3366 
3367     CLEAR_ERRORS
3368 }
3369 
3370 
// A record whose accession differs from the one expected for its GI must
// produce UnexpectedIdentifierChange: once for a changed GenBank accession
// and once for a GenBank accession replaced by a TPG one (reported as a loss
// of accession, together with HistAssemblyMissing for the TPG id).
BOOST_FIXTURE_TEST_CASE(Test_UnexpectedIdentifierChange, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // gb|AY123457.1| together with gi 21914627
    CRef<CSeq_id> primary_id = entry->SetSeq().SetId().front();
    primary_id->SetGenbank().SetAccession("AY123457");
    primary_id->SetGenbank().SetVersion(1);
    CRef<CSeq_id> gi_id(new CSeq_id());
    gi_id->SetGi(GI_CONST(21914627));
    entry->SetSeq().SetId().push_back(gi_id);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("gb|AY123457.1|", eDiag_Warning, "UnexpectedIdentifierChange", "New accession (gb|AY123457.1|) does not match one in NCBI sequence repository (gb|AY123456.1|) on gi (21914627)"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Same accession number, but as a TPG id instead of a GenBank id.
    scope.RemoveTopLevelSeqEntry(seh);
    primary_id->SetTpg().SetAccession("AY123456");
    primary_id->SetTpg().SetVersion(1);
    seh = scope.AddTopLevelSeqEntry(*entry);

    const string tpg_acc = "tpg|AY123456.1|";
    expected_errors.push_back(new CExpectedError(tpg_acc, eDiag_Info, "HistAssemblyMissing", "TPA record tpg|AY123456.1| should have Seq-hist.assembly for PRIMARY block"));
    expected_errors.push_back(new CExpectedError(tpg_acc, eDiag_Warning, "UnexpectedIdentifierChange", "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // TODO - try to instigate other errors

    CLEAR_ERRORS
}
3402 
3403 
BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqLit)
{
    // Good delta sequence extended with a literal containing 20 internal Ns,
    // marked as WGS.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNGG");
    SetTech(entry, CMolInfo::eTech_wgs);

    STANDARD_SETUP

    // Validate the current handle and compare against the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // WGS: the 20-N run is reported, together with the WGS gap-type error.
    // (HighNpercent5Prime / HighNpercent3Prime expectations are disabled.)
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "InternalNsInSeqLit",
        "Run of 20 Ns in delta component 5 that starts at base 45"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "WGSseqGapProblem",
        "WGS submission includes wrong gap type. Gaps for WGS genomes should be Assembly Gaps with linkage evidence."));
    AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS

    // HTGS-type techs: add a literal with 81 internal Ns; only that run is
    // in the expected list. (HighNpercent expectations remain disabled.)
    unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
    SetTech(entry, CMolInfo::eTech_htgs_1);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "InternalNsInSeqLit",
        "Run of 81 Ns in delta component 7 that starts at base 79"));
    validate_and_check();

    // Same expectation for htgs_2 ...
    SetTech(entry, CMolInfo::eTech_htgs_2);
    validate_and_check();

    // ... and for composite_wgs_htgs.
    SetTech(entry, CMolInfo::eTech_composite_wgs_htgs);
    validate_and_check();

    // Unknown tech: add a literal with 101 internal Ns; the single expected
    // error is retargeted at that run.
    unit_test_util::AddToDeltaSeq(entry, "AANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGG");
    SetTech(entry, CMolInfo::eTech_unknown);
    expected_errors[0]->SetErrMsg("Run of 101 Ns in delta component 9 that starts at base 174");
    validate_and_check();

    CLEAR_ERRORS
}
3458 
3459 
BOOST_AUTO_TEST_CASE(Test_SeqLitGapLength0)3460 BOOST_AUTO_TEST_CASE(Test_SeqLitGapLength0)
3461 {
3462     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
3463     CRef<CDelta_seq> delta_seq(new CDelta_seq());
3464     delta_seq->SetLiteral().SetLength(0);
3465     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
3466 
3467     STANDARD_SETUP
3468 
3469     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SeqLitGapLength0", "Gap of length 0 in delta chain"));
3470     // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3471     //AddChromosomeNoLocation(expected_errors, entry);
3472     eval = validator.Validate(seh, options);
3473     CheckErrors (*eval, expected_errors);
3474 
3475     // some kinds of fuzz don't trigger other kind of error
3476     delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3477     eval = validator.Validate(seh, options);
3478     CheckErrors (*eval, expected_errors);
3479 
3480     delta_seq->SetLiteral().SetFuzz().Reset();
3481     delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3482     eval = validator.Validate(seh, options);
3483     CheckErrors (*eval, expected_errors);
3484 
3485     // others will
3486     delta_seq->SetLiteral().SetFuzz().Reset();
3487     delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3488     expected_errors[0]->SetErrMsg("Gap of length 0 with unknown fuzz in delta chain");
3489     eval = validator.Validate(seh, options);
3490     CheckErrors (*eval, expected_errors);
3491 
3492     // try again with swissprot, error goes to warning
3493     scope.RemoveTopLevelSeqEntry(seh);
3494     entry->SetSeq().SetId().front()->SetSwissprot().SetAccession("AY123456");
3495     seh = scope.AddTopLevelSeqEntry(*entry);
3496     expected_errors[0]->SetSeverity(eDiag_Warning);
3497     ChangeErrorAcc(expected_errors, "sp|AY123456|");
3498     eval = validator.Validate(seh, options);
3499     CheckErrors (*eval, expected_errors);
3500 
3501     delta_seq->SetLiteral().SetFuzz().SetP_m(10);
3502     expected_errors[0]->SetErrMsg("Gap of length 0 in delta chain");
3503     eval = validator.Validate(seh, options);
3504     CheckErrors (*eval, expected_errors);
3505 
3506     delta_seq->SetLiteral().SetFuzz().Reset();
3507     delta_seq->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_gt);
3508     eval = validator.Validate(seh, options);
3509     CheckErrors (*eval, expected_errors);
3510 
3511     delta_seq->SetLiteral().ResetFuzz();
3512     eval = validator.Validate(seh, options);
3513     CheckErrors (*eval, expected_errors);
3514 
3515     CLEAR_ERRORS
3516 }
3517 
3518 
AddTpaAssemblyUserObject(CRef<CSeq_entry> entry)3519 static void AddTpaAssemblyUserObject(CRef<CSeq_entry> entry)
3520 {
3521     CRef<CSeqdesc> desc(new CSeqdesc());
3522     desc->SetUser().SetType().SetStr("TpaAssembly");
3523     entry->SetSeq().SetDescr().Set().push_back(desc);
3524 
3525     CRef<CUser_field> field(new CUser_field());
3526     field->SetLabel().SetStr("Label");
3527     field->SetData().SetStr("Data");
3528     desc->SetUser().SetData().push_back(field);
3529 }
3530 
3531 
BOOST_FIXTURE_TEST_CASE(Test_TpaAssemblyProblem, CGenBankFixture)
{
    // GenBank set with two good sequences, each carrying a TpaAssembly
    // user object.
    CRef<CSeq_entry> entry(new CSeq_entry());
    entry->SetSet().SetClass(CBioseq_set::eClass_genbank);

    CRef<CSeq_entry> member1 = unit_test_util::BuildGoodSeq();
    CSeq_id& member1_id = *member1->SetSeq().SetId().front();
    member1_id.SetLocal().SetStr("good");
    AddTpaAssemblyUserObject(member1);
    entry->SetSet().SetSeq_set().push_back(member1);

    CRef<CSeq_entry> member2 = unit_test_util::BuildGoodSeq();
    member2->SetSeq().SetId().front()->SetLocal().SetStr("good2");
    AddTpaAssemblyUserObject(member2);
    entry->SetSet().SetSeq_set().push_back(member2);

    STANDARD_SETUP

    // Validate the current handle and compare against the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Two TPA sequences, neither with an assembly nor a gi: nothing expected.
    validate_and_check();

    // Give only the first member a history assembly.
    member1->SetSeq().SetInst().SetHist().SetAssembly().push_back(unit_test_util::BuildGoodAlign());
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "TpaAssemblyProblem",
        "There are 1 TPAs with history and 1 without history in this record."));
    validate_and_check();

    // Now turn the first member into a TPG record that also has a gi.
    scope.RemoveTopLevelSeqEntry(seh);
    member1_id.SetTpg().SetAccession("AY123456");
    member1_id.SetTpg().SetVersion(1);
    CRef<CSeq_id> gi_id(new CSeq_id());
    gi_id->SetGi(GI_CONST(21914627));
    member1->SetSeq().SetId().push_back(gi_id);
    seh = scope.AddTopLevelSeqEntry(*entry);

    CLEAR_ERRORS

    expected_errors.push_back(new CExpectedError(
        "tpg|AY123456.1|", eDiag_Warning, "UnexpectedIdentifierChange",
        "Loss of accession (gb|AY123456.1|) on gi (21914627) compared to the NCBI sequence repository"));
    expected_errors.push_back(new CExpectedError(
        "tpg|AY123456.1|", eDiag_Error, "TpaAssemblyProblem",
        "There are 1 TPAs with history and 1 without history in this record."));
    expected_errors.push_back(new CExpectedError(
        "tpg|AY123456.1|", eDiag_Warning, "TpaAssemblyProblem",
        "There are 1 TPAs without history in this record, but the record has a gi number assignment."));
    validate_and_check();

    CLEAR_ERRORS
}
3580 
3581 
BOOST_FIXTURE_TEST_CASE(Test_SeqLocLength, CGenBankFixture)
{
    // Delta sequence whose first component is a 10-base interval (0..9) on
    // gb|AY123456.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    {
        auto& first_interval =
            entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt();
        first_interval.SetId().SetGenbank().SetAccession("AY123456");
        first_interval.SetFrom(0);
        first_interval.SetTo(9);
    }
    entry->SetSeq().SetInst().SetLength(32);

    STANDARD_SETUP

    // Validate the current handle and compare against the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // No errors are expected; the former SeqLocLength expectation
    // ("Short length (10) on seq-loc (gb|AY123456|:1-10) of delta seq_ext")
    // is disabled.
    validate_and_check();

    // Rebuild with an 11-base interval (0..10): still no errors expected.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodDeltaSeq();
    {
        auto& first_interval =
            entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt();
        first_interval.SetId().SetGenbank().SetAccession("AY123456");
        first_interval.SetFrom(0);
        first_interval.SetTo(10);
    }
    entry->SetSeq().SetInst().SetLength(33);
    seh = scope.AddTopLevelSeqEntry(*entry);
    validate_and_check();

    CLEAR_ERRORS
}
3611 
3612 
BOOST_AUTO_TEST_CASE(Test_MissingGaps)
{
    // Good delta sequence with all of its gaps stripped out.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    unit_test_util::RemoveDeltaSeqGaps (entry);

    STANDARD_SETUP

    // Validate the current handle and compare against the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Errors are only reported for specific MolInfo tech values:
    // nothing for the default tech ...
    validate_and_check();

    // ... and nothing for htgs_3.
    SetTech(entry, CMolInfo::eTech_htgs_3);
    validate_and_check();

    // htgs_0 reports MissingGaps.
    SetTech(entry, CMolInfo::eTech_htgs_0);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "MissingGaps",
        "HTGS delta seq should have gaps between all sequence runs"));
    validate_and_check();

    // So does htgs_1.
    SetTech(entry, CMolInfo::eTech_htgs_1);
    validate_and_check();

    // htgs_2 additionally reports BadHTGSeq.
    SetTech(entry, CMolInfo::eTech_htgs_2);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadHTGSeq",
        "HTGS 2 delta seq has no gaps and no graphs"));
    validate_and_check();

    // With a RefSeq id and a RefGeneTracking user object the MissingGaps
    // severity becomes Info.
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
    AddRefGeneTrackingUserObject(entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors[0]->SetSeverity(eDiag_Info);
    ChangeErrorAcc(expected_errors, "ref|NC_123456|");
    validate_and_check();

    // Drop the BadHTGSeq expectation (the last of the two entries) before
    // moving away from htgs_2.
    delete expected_errors.back();
    expected_errors.pop_back();

    SetTech(entry, CMolInfo::eTech_htgs_1);
    validate_and_check();

    SetTech(entry, CMolInfo::eTech_htgs_0);
    validate_and_check();

    CLEAR_ERRORS
}
3667 
3668 
BOOST_AUTO_TEST_CASE(Test_CompleteTitleProblem)
{
    // Viral GenBank record whose title claims "complete genome" while no
    // completeness has been set.
    CRef<CSeq_entry> entry = BuildGoodSeq();
    entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
    SetLineage (entry, "Viruses; foo");
    SetTitle(entry, "Foo complete genome");

    STANDARD_SETUP

    // Validate the current handle and compare against the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError(
        "gb|AY123456|", eDiag_Warning, "CompleteTitleProblem",
        "Complete genome in title without complete flag set"));
    validate_and_check();

    CLEAR_ERRORS

    // Setting the complete flag removes the warning.
    SetCompleteness(entry, CMolInfo::eCompleteness_complete);
    validate_and_check();

    // A gapped (delta) sequence with the same title and the complete flag
    // gets a different code and message.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = BuildGoodDeltaSeq();
    entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
    unit_test_util::SetLineage (entry, "Viruses; foo");
    SetTitle(entry, "Foo complete genome");
    SetCompleteness(entry, CMolInfo::eCompleteness_complete);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "gb|AY123456|", eDiag_Warning, "CompleteGenomeHasGaps",
        "Title contains 'complete genome' but sequence has gaps"));
    validate_and_check();

    CLEAR_ERRORS
}
3712 
3713 
BOOST_AUTO_TEST_CASE(Test_CompleteCircleProblem)
{
    // Good sequence made circular, with no completeness set.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);

    STANDARD_SETUP

    // Validate the current handle and compare against the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CompleteCircleProblem",
        "Circular topology without complete flag set"));
    validate_and_check();

    CLEAR_ERRORS

    // GenBank id, complete flag set, but a title that mentions neither
    // "complete sequence" nor "complete genome".
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
    SetTitle(entry, "This is just a title");
    unit_test_util::SetCompleteness(entry, CMolInfo::eCompleteness_complete);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "gb|AY123456|", eDiag_Warning, "CompleteCircleProblem",
        "Circular topology has complete flag set, but title should say complete sequence or complete genome"));
    expected_errors.push_back(new CExpectedError(
        "gb|AY123456|", eDiag_Warning, "UnwantedCompleteFlag",
        "Suspicious use of complete"));
    validate_and_check();

    CLEAR_ERRORS
}
3750 
3751 
BOOST_AUTO_TEST_CASE(Test_BadHTGSeq)3752 BOOST_AUTO_TEST_CASE(Test_BadHTGSeq)
3753 {
3754     // prepare entry
3755     CRef<CSeq_entry> delta_entry = unit_test_util::BuildGoodDeltaSeq();
3756     // remove gaps
3757     unit_test_util::RemoveDeltaSeqGaps (delta_entry);
3758 
3759     STANDARD_SETUP_NAME(delta_entry)
3760 
3761     SetTech(delta_entry, CMolInfo::eTech_htgs_2);
3762     //AddChromosomeNoLocation(expected_errors, delta_entry);
3763     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingGaps", "HTGS delta seq should have gaps between all sequence runs"));
3764     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 delta seq has no gaps and no graphs"));
3765     eval = validator.Validate(seh, options);
3766     CheckErrors (*eval, expected_errors);
3767 
3768     delete expected_errors[1];
3769     expected_errors.pop_back();
3770 
3771     // HTGS_ACTIVEFIN keyword disables BadHTGSeq error
3772     AddGenbankKeyword(delta_entry, "HTGS_ACTIVEFIN");
3773     eval = validator.Validate(seh, options);
3774     CheckErrors (*eval, expected_errors);
3775 
3776     CLEAR_ERRORS
3777 
3778     scope.RemoveTopLevelSeqEntry(seh);
3779     CRef<CSeq_entry> raw_entry = unit_test_util::BuildGoodSeq();
3780     SetTech(raw_entry, CMolInfo::eTech_htgs_2);
3781     seh = scope.AddTopLevelSeqEntry(*raw_entry);
3782     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq", "HTGS 2 raw seq has no gaps and no graphs"));
3783     //AddChromosomeNoLocation(expected_errors, raw_entry);
3784     eval = validator.Validate(seh, options);
3785     CheckErrors (*eval, expected_errors);
3786 
3787     CLEAR_ERRORS
3788 
3789     // HTGS_ACTIVEFIN keyword disables error
3790     AddGenbankKeyword(raw_entry, "HTGS_ACTIVEFIN");
3791     //AddChromosomeNoLocation(expected_errors, raw_entry);
3792     eval = validator.Validate(seh, options);
3793     CheckErrors (*eval, expected_errors);
3794 
3795 
3796     // htg3 errors
3797     SetTech(raw_entry, CMolInfo::eTech_htgs_3);
3798     AddGenbankKeyword(raw_entry, "HTGS_DRAFT");
3799     AddGenbankKeyword(raw_entry, "HTGS_PREFIN");
3800     AddGenbankKeyword(raw_entry, "HTGS_FULLTOP");
3801     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_DRAFT keyword"));
3802     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_PREFIN keyword"));
3803     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_ACTIVEFIN keyword"));
3804     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadHTGSeq", "HTGS 3 sequence should not have HTGS_FULLTOP keyword"));
3805     eval = validator.Validate(seh, options);
3806     CheckErrors (*eval, expected_errors);
3807 
3808     scope.RemoveTopLevelSeqEntry(seh);
3809     seh = scope.AddTopLevelSeqEntry(*delta_entry);
3810     SetTech(delta_entry, CMolInfo::eTech_htgs_3);
3811     AddGenbankKeyword(delta_entry, "HTGS_DRAFT");
3812     AddGenbankKeyword(delta_entry, "HTGS_PREFIN");
3813     AddGenbankKeyword(delta_entry, "HTGS_FULLTOP");
3814     eval = validator.Validate(seh, options);
3815     CheckErrors (*eval, expected_errors);
3816 
3817     CLEAR_ERRORS
3818 }
3819 
3820 
BOOST_AUTO_TEST_CASE(Test_GapInProtein_and_BadProteinStart)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();

    // Replace the protein residues with the given NCBIeaa string.
    auto set_residues = [&](const char* residues) {
        entry->SetSeq().SetInst().SetSeq_data().SetNcbieaa().Set(residues);
    };

    // Internal gap symbol only.
    set_residues("PRK-EIN");

    STANDARD_SETUP

    // Validate the current handle and compare against the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "GapInProtein",
        "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
    validate_and_check();

    CLEAR_ERRORS

    // Gap symbol at the very start only.
    set_residues("-RKTEIN");
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "BadProteinStart",
        "gap symbol at start of protein sequence (gene? - fake protein name)"));
    validate_and_check();

    // Leading and internal gap symbols: both errors are expected.
    set_residues("-RK-EIN");
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "GapInProtein",
        "[1] internal gap symbols in protein sequence (gene? - fake protein name)"));
    validate_and_check();

    CLEAR_ERRORS
}
3849 
3850 
BOOST_AUTO_TEST_CASE(Test_TerminalGap)3851 BOOST_AUTO_TEST_CASE(Test_TerminalGap)
3852 {
3853     // prepare entry
3854     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
3855     CRef<CDelta_seq> first_seg(new CDelta_seq());
3856     first_seg->SetLiteral().SetLength(9);
3857     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_front(first_seg);
3858     CRef<CDelta_seq> last_seg(new CDelta_seq());
3859     last_seg->SetLiteral().SetLength(9);
3860     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(last_seg);
3861     entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 18);
3862 
3863     STANDARD_SETUP
3864 
3865     // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "First delta seq component is a gap"));
3866     // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDeltaSeq", "Last delta seq component is a gap"));
3867     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at beginning of sequence"));
3868     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TerminalGap", "Gap at end of sequence"));
3869     /*
3870     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
3871         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3872     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
3873         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3874     */
3875     //AddChromosomeNoLocation(expected_errors, entry);
3876 
3877     eval = validator.Validate(seh, options);
3878     CheckErrors (*eval, expected_errors);
3879 
3880     // if gap length is 10, severity is still warning because still all local IDS
3881     scope.RemoveTopLevelSeqEntry(seh);
3882     entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral().SetLength(10);
3883     entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(10);
3884     entry->SetSeq().SetInst().SetLength(entry->SetSeq().SetInst().GetLength() + 2);
3885     seh = scope.AddTopLevelSeqEntry(*entry);
3886     eval = validator.Validate(seh, options);
3887     CheckErrors (*eval, expected_errors);
3888 
3889 
3890     scope.RemoveTopLevelSeqEntry(seh);
3891     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
3892     seh = scope.AddTopLevelSeqEntry(*entry);
3893     ChangeErrorAcc(expected_errors, "ref|NC_123456|");
3894     /*
3895     expected_errors[2]->SetSeverity(eDiag_Warning);
3896     expected_errors[3]->SetSeverity(eDiag_Warning);
3897     */
3898     eval = validator.Validate(seh, options);
3899     CheckErrors (*eval, expected_errors);
3900 
3901     scope.RemoveTopLevelSeqEntry(seh);
3902     entry->SetSeq().SetId().front()->SetPatent().SetSeqid(1);
3903     entry->SetSeq().SetId().front()->SetPatent().SetCit().SetCountry("USA");
3904     entry->SetSeq().SetId().front()->SetPatent().SetCit().SetId().SetNumber("1");
3905     seh = scope.AddTopLevelSeqEntry(*entry);
3906     ChangeErrorAcc(expected_errors, "pat|USA|1|1");
3907     eval = validator.Validate(seh, options);
3908     CheckErrors (*eval, expected_errors);
3909 
3910     CLEAR_ERRORS
3911 
3912     // no more terminal gap warnings if circular - changed to still show first/last delta component
3913     entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
3914     unit_test_util::SetCompleteness(entry, CMolInfo::eCompleteness_complete);
3915     expected_errors.push_back(new CExpectedError("pat|USA|1|1", eDiag_Warning, "UnwantedCompleteFlag",
3916                               "Suspicious use of complete"));
3917     //AddChromosomeNoLocation(expected_errors, entry);
3918 
3919     eval = validator.Validate(seh, options);
3920     CheckErrors (*eval, expected_errors);
3921     CLEAR_ERRORS
3922 }
3923 
3924 
BOOST_FIXTURE_TEST_CASE(Test_OverlappingDeltaRange, CGenBankFixture)
{
    // Replace the delta extension with four ranges on gb|AY123456, arranged
    // as two overlapping pairs (0-10 / 5-15 and 20-30 / 25-35).
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    CSeq_inst& inst = entry->SetSeq().SetInst();
    inst.ResetExt();

    CRef<CSeq_id> seqid(new CSeq_id());
    seqid->SetGenbank().SetAccession("AY123456");

    auto& delta = inst.SetExt().SetDelta();
    delta.AddSeqRange(*seqid, 0, 10);
    delta.AddSeqRange(*seqid, 5, 15);
    delta.AddSeqRange(*seqid, 20, 30);
    delta.AddSeqRange(*seqid, 25, 35);
    inst.SetLength(44);

    STANDARD_SETUP

    // One warning per overlapping pair.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "OverlappingDeltaRange",
        "Overlapping delta range 6-16 and 1-11 on a Bioseq gb|AY123456|"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "OverlappingDeltaRange",
        "Overlapping delta range 26-36 and 21-31 on a Bioseq gb|AY123456|"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
3948 
3949 
BOOST_AUTO_TEST_CASE(Test_LeadingX)
{
    // Protein whose first residue is X.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();
    CSeq_inst& inst = entry->SetSeq().SetInst();
    inst.SetSeq_data().SetIupacaa().Set("XROTEIN");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "LeadingX", "Sequence starts with leading X"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
3965 
3966 
BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqRaw)3967 BOOST_AUTO_TEST_CASE(Test_InternalNsInSeqRaw)
3968 {
3969     // prepare entry
3970     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
3971     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTTT");
3972     entry->SetSeq().SetInst().SetLength(110);
3973 
3974     STANDARD_SETUP
3975 
3976     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 100 Ns in raw sequence starting at base 6"));
3977     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 90 percent Ns"));
3978     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
3979         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3980     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
3981         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3982     //AddChromosomeNoLocation(expected_errors, entry);
3983     eval = validator.Validate(seh, options);
3984     CheckErrors (*eval, expected_errors);
3985 
3986     CLEAR_ERRORS
3987 
3988     // expect no InternalNsInSeqRaw error
3989     scope.RemoveTopLevelSeqEntry(seh);
3990     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAANNNNNNNNNNNNNNNNNNNNTTTTT");
3991     entry->SetSeq().SetInst().SetLength(30);
3992     seh = scope.AddTopLevelSeqEntry(*entry);
3993     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
3994     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
3995         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
3996     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
3997         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
3998     //AddChromosomeNoLocation(expected_errors, entry);
3999     eval = validator.Validate(seh, options);
4000     CheckErrors (*eval, expected_errors);
4001 
4002     CLEAR_ERRORS
4003 
4004     // WGS has lower threshold
4005     SetTech (entry, CMolInfo::eTech_wgs);
4006     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InternalNsInSeqRaw", "Run of 20 Ns in raw sequence starting at base 6"));
4007     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 66 percent Ns"));
4008     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4009         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4010     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4011         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4012     AddChromosomeNoLocation(expected_errors, entry);
4013     eval = validator.Validate(seh, options);
4014     CheckErrors (*eval, expected_errors);
4015 
4016     CLEAR_ERRORS
4017 }
4018 
4019 
// InternalNsAdjacentToGap: the first and last literals of a delta sequence
// are rewritten so that the first one ends in "NNN" and the last one starts
// with "NNN"; one Error is expected for each of them.
BOOST_AUTO_TEST_CASE(Test_InternalNsAdjacentToGap)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Touch only the outermost segments of the delta chain.
    CDelta_ext::Tdata& segments = entry->SetSeq().SetInst().SetExt().SetDelta().Set();
    segments.front()->SetLiteral().SetSeq_data().SetIupacna().Set("ATGATGATGNNN");
    segments.back()->SetLiteral().SetSeq_data().SetIupacna().Set("NNNATGATGATG");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "InternalNsAdjacentToGap",
        "Ambiguous residue N is adjacent to a gap around position 13"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "InternalNsAdjacentToGap",
        "Ambiguous residue N is adjacent to a gap around position 23"));
    // NOTE: the HighNpercent5Prime / HighNpercent3Prime warnings and the
    // AddChromosomeNoLocation() expectation are currently disabled for
    // this test, so they are not part of the expected list.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4044 
4045 
// DeltaComponentIsGi0: the first delta segment is turned into a location
// (interval 0..11) whose Seq-id is gi ZERO_GI.  The test expects a Critical
// DeltaComponentIsGi0 plus an Error DeltaSeqError.
BOOST_AUTO_TEST_CASE(Test_DeltaComponentIsGi0)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Replace the leading segment with an interval on gi ZERO_GI.
    CDelta_seq& first_seg = *entry->SetSeq().SetInst().SetExt().SetDelta().Set().front();
    first_seg.SetLoc().SetInt().SetFrom(0);
    first_seg.SetLoc().SetInt().SetTo(11);
    first_seg.SetLoc().SetInt().SetId().SetGi(ZERO_GI);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "DeltaComponentIsGi0",
        "Delta component is gi|0"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "DeltaSeqError",
        "Unable to find far delta sequence component"));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4065 
4066 
// InternalGapsInSeqRaw: a raw IUPACna sequence containing a '-' character
// is expected to yield a Critical InvalidResidue and a Warning
// InternalGapsInSeqRaw.
BOOST_AUTO_TEST_CASE(Test_InternalGapsInSeqRaw)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Same sequence text as used below for the expected position [27]:
    // the only non-nucleotide character is the single '-'.
    CSeq_inst& inst = entry->SetSeq().SetInst();
    inst.SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGCCAAAATTGG-CAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "InvalidResidue",
        "Invalid residue '-' at position [27]"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "InternalGapsInSeqRaw",
        "Raw nucleotide should not contain gap characters"));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4084 
4085 
// SelfReferentialSequence: the first delta segment is turned into an
// interval (0..11) on local id "good" -- the same id the expected error is
// reported against -- and a Critical SelfReferentialSequence is expected.
BOOST_AUTO_TEST_CASE(Test_SelfReferentialSequence)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Point the leading segment at lcl|good.
    CDelta_seq& first_seg = *entry->SetSeq().SetInst().SetExt().SetDelta().Set().front();
    first_seg.SetLoc().SetInt().SetFrom(0);
    first_seg.SetLoc().SetInt().SetTo(11);
    first_seg.SetLoc().SetInt().SetId().SetLocal().SetStr("good");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "SelfReferentialSequence",
        "Self-referential delta sequence"));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4104 
4105 
// WholeComponent: the first delta segment becomes a "whole" location on
// GenBank accession AY123456 and the sequence length is set to 507.  An
// Error WholeComponent is expected.  Runs with the CGenBankFixture fixture.
BOOST_FIXTURE_TEST_CASE(Test_WholeComponent, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    CSeq_inst& inst = entry->SetSeq().SetInst();
    inst.SetExt().SetDelta().Set().front()->SetLoc().SetWhole().SetGenbank().SetAccession("AY123456");
    inst.SetLength(507);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "WholeComponent",
        "Delta seq component should not be of type whole"));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4123 
4124 
s_AddGeneralAndLocal(CBioseq & seq)4125 void s_AddGeneralAndLocal(CBioseq& seq)
4126 {
4127     CRef<CSeq_id> gnl(new CSeq_id());
4128     gnl->SetGeneral().SetDb("a");
4129     gnl->SetGeneral().SetTag().SetStr("b");
4130     seq.SetId().front()->Assign(*gnl);
4131     CRef<CSeq_id> lcl(new CSeq_id());
4132     lcl->SetLocal().SetStr("x");
4133     seq.SetId().push_back(lcl);
4134     seq.SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId().Assign(*gnl);
4135 }
4136 
4137 
// ProteinsHaveGeneralID: a protein carrying a general Seq-id produces no
// message when validated on its own, but produces an Info-level
// ProteinsHaveGeneralID when it is the protein of a nuc-prot set.
BOOST_AUTO_TEST_CASE(Test_ProteinsHaveGeneralID)
{
    // Case 1: stand-alone protein with general + local ids.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();
    s_AddGeneralAndLocal(entry->SetSeq());

    STANDARD_SETUP

    // Nothing is expected here (the AddChromosomeNoLocation() expectation
    // is currently disabled as well).
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS

    // Case 2: same id change applied to the protein of a nuc-prot set.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();

    CRef<CSeq_entry> prot = GetProteinSequenceFromGoodNucProtSet(entry);
    s_AddGeneralAndLocal(prot->SetSeq());

    // The CDS product is set to the same general id (db "a", tag "b").
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetProduct().SetWhole().SetGeneral().SetDb("a");
    cds->SetProduct().SetWhole().SetGeneral().SetTag().SetStr("b");

    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Info, "ProteinsHaveGeneralID",
        "INDEXER_ONLY - Protein bioseqs have general seq-id."));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4170 
4171 
BOOST_AUTO_TEST_CASE(Test_HighNContentPercent_and_HighNContentStretch)4172 BOOST_AUTO_TEST_CASE(Test_HighNContentPercent_and_HighNContentStretch)
4173 {
4174     // prepare entry
4175     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
4176     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4177     entry->SetSeq().SetInst().SetLength(100);
4178     SetTech (entry, CMolInfo::eTech_tsa);
4179     unit_test_util::SetBiomol (entry, CMolInfo::eBiomol_mRNA);
4180     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
4181 
4182     STANDARD_SETUP
4183 
4184     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent", "Sequence contains 11 percent Ns"));
4185     //AddChromosomeNoLocation(expected_errors, entry);
4186     eval = validator.Validate(seh, options);
4187     CheckErrors (*eval, expected_errors);
4188 
4189     scope.RemoveTopLevelSeqEntry(seh);
4190     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCNNNNNNNNNNNNNNNNTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTT");
4191     seh = scope.AddTopLevelSeqEntry(*entry);
4192     expected_errors[0]->SetErrMsg("Sequence contains 16 percent Ns");
4193     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4194     eval = validator.Validate(seh, options);
4195     CheckErrors (*eval, expected_errors);
4196 
4197     CLEAR_ERRORS
4198 
4199     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentStretch", "Sequence has a stretch of 16 Ns"));
4200     eval = validator.GetTSANStretchErrors(seh);
4201     CheckErrors (*eval, expected_errors);
4202     eval = validator.GetTSANStretchErrors(entry->GetSeq());
4203     CheckErrors (*eval, expected_errors);
4204 
4205     CLEAR_ERRORS
4206 
4207     scope.RemoveTopLevelSeqEntry(seh);
4208     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AANNNNNNNNNNGGGCCCCCAAAAATTTTTGGGGGCCCCCAAAAATTTTTGGGGGTTTTTGGGGGCCCCCAAAAATTTTTGGGGGCCNNNNNNNNNNAAA");
4209     seh = scope.AddTopLevelSeqEntry(*entry);
4210     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent5Prime",
4211         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4212     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "HighNpercent3Prime",
4213         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4214     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNContentPercent",
4215         "Sequence contains 20 percent Ns"));
4216     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime",
4217         "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4218     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime",
4219         "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4220     //AddChromosomeNoLocation(expected_errors, entry);
4221     eval = validator.Validate(seh, options);
4222     CheckErrors (*eval, expected_errors);
4223 
4224     CLEAR_ERRORS
4225 
4226     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent5Prime", "Sequence has a stretch of at least 10 Ns within the first 20 bases"));
4227     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNcontent3Prime", "Sequence has a stretch of at least 10 Ns within the last 20 bases"));
4228     eval = validator.GetTSANStretchErrors(seh);
4229     CheckErrors (*eval, expected_errors);
4230     eval = validator.GetTSANStretchErrors(entry->GetSeq());
4231     CheckErrors (*eval, expected_errors);
4232 
4233     CLEAR_ERRORS
4234 
4235     scope.RemoveTopLevelSeqEntry(seh);
4236     entry = unit_test_util::BuildGoodDeltaSeq();
4237     CRef<objects::CDelta_seq> gap_seg(new objects::CDelta_seq());
4238     gap_seg->SetLiteral().SetSeq_data().SetGap();
4239     gap_seg->SetLiteral().SetLength(10);
4240     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4241     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGA", objects::CSeq_inst::eMol_dna);
4242     entry->SetSeq().SetInst().SetLength(entry->GetSeq().GetInst().GetLength() + 20);
4243     seh = scope.AddTopLevelSeqEntry(*entry);
4244 
4245     /*
4246     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
4247         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
4248     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
4249         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
4250     */
4251     //AddChromosomeNoLocation(expected_errors, entry);
4252 
4253     eval = validator.Validate(seh, options);
4254     CheckErrors (*eval, expected_errors);
4255 
4256     CLEAR_ERRORS
4257 }
4258 
4259 
// SeqLitDataLength0: the second segment of a delta sequence is replaced by
// an empty IUPACna literal of length 0 and the Bioseq length is set to 24;
// an Error SeqLitDataLength0 is expected.
BOOST_AUTO_TEST_CASE(Test_SeqLitDataLength0)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    CSeq_inst& inst = entry->SetSeq().SetInst();

    // Step to the second element of the delta chain.
    CDelta_ext::Tdata& segments = inst.SetExt().SetDelta().Set();
    CDelta_ext::Tdata::iterator second = segments.begin();
    ++second;

    // Set() with no argument marks the IUPACna data as present but empty.
    CDelta_seq& emptied = **second;
    emptied.SetLiteral().SetSeq_data().SetIupacna().Set();
    emptied.SetLiteral().SetLength(0);

    inst.SetLength(24);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "SeqLitDataLength0",
        "Seq-lit of length 0 in delta chain"));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4281 
4282 
BuildGapFuzz100DeltaSeq(void)4283 static CRef<CSeq_entry> BuildGapFuzz100DeltaSeq(void)
4284 {
4285     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
4286 
4287     entry->SetSeq().SetInst().ResetSeq_data();
4288     entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
4289     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("ATGATGATGCCC", CSeq_inst::eMol_dna);
4290     CRef<CDelta_seq> gap_seg(new CDelta_seq());
4291     gap_seg->SetLiteral().SetLength(101);
4292     gap_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
4293     entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap_seg);
4294     entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATG", CSeq_inst::eMol_dna);
4295     entry->SetSeq().SetInst().SetLength(125);
4296 
4297     return entry;
4298 }
4299 
4300 
// UnknownLengthGapNot100: validating the entry produced by
// BuildGapFuzz100DeltaSeq() is expected to give a single Warning.
BOOST_AUTO_TEST_CASE(Test_UnknownLengthGapNot100)
{
    CRef<CSeq_entry> entry = BuildGapFuzz100DeltaSeq();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "UnknownLengthGapNot100",
        "Gap of unknown length should have length 100"));
    // NOTE: the HighNpercent5Prime / HighNpercent3Prime warnings and the
    // AddChromosomeNoLocation() expectation are currently disabled for
    // this test, so they are not part of the expected list.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4320 
4321 
// mRNAshouldBeSingleStranded: an mRNA whose strand is ds, mixed or other
// is expected to yield one Error; not_set, unset and ss are expected to
// yield nothing.
BOOST_AUTO_TEST_CASE(Test_DSmRNA)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CSeq_inst& inst = entry->SetSeq().SetInst();
    inst.SetMol(CSeq_inst::eMol_rna);
    unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
    inst.SetStrand(CSeq_inst::eStrand_ds);

    STANDARD_SETUP

    // ---- strands that must be reported -----------------------------------
    // The same expectation is reused for all three values.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "mRNAshouldBeSingleStranded",
        "mRNA should be single stranded not double stranded"));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.

    // double strand (set before STANDARD_SETUP)
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // mixed strand
    inst.SetStrand(CSeq_inst::eStrand_mixed);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // other strand
    inst.SetStrand(CSeq_inst::eStrand_other);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // ---- strands that must NOT be reported -------------------------------
    // expected_errors is empty from here on.

    // strand explicitly "not set"
    inst.SetStrand(CSeq_inst::eStrand_not_set);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // strand field reset
    inst.ResetStrand();
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // single strand
    inst.SetStrand(CSeq_inst::eStrand_ss);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4370 
4371 
// BioSourceMissing: source descriptors are removed from a nuc-prot set and
// a good source is added back only to the first member of the set.  The
// test expects a Warning BioSourceMissing on lcl|nuc and a Fatal NoOrgFound
// on lcl|prot.
BOOST_AUTO_TEST_CASE(Test_BioSourceMissing)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Drop the source descriptor(s), then restore one on the first member only.
    unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Source);
    CRef<CSeq_entry> first_member = entry->SetSet().SetSeq_set().front();
    unit_test_util::AddGoodSource(first_member);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BioSourceMissing",
        "Nuc-prot set does not contain expected BioSource descriptor"));
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Fatal, "NoOrgFound",
        "No organism name included in the source. Other qualifiers may exist."));
    // NOTE: the AddChromosomeNoLocation() expectation is currently disabled.

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4390 
4391 
BOOST_AUTO_TEST_CASE(Test_Descr_InvalidForType)4392 BOOST_AUTO_TEST_CASE(Test_Descr_InvalidForType)
4393 {
4394     // prepare entry
4395     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
4396     CRef<CSeqdesc> desc;
4397     desc.Reset(new CSeqdesc());
4398     desc->SetMol_type(eGIBB_mol_genomic);
4399     entry->SetDescr().Set().push_back(desc);
4400     desc.Reset(new CSeqdesc());
4401     desc->SetModif().push_back(eGIBB_mod_dna);
4402     entry->SetDescr().Set().push_back(desc);
4403     desc.Reset(new CSeqdesc());
4404     desc->SetMethod(eGIBB_method_other);
4405     entry->SetDescr().Set().push_back(desc);
4406     desc.Reset(new CSeqdesc());
4407     desc->SetOrg().SetTaxname("Sebaea microphylla");
4408     entry->SetDescr().Set().push_back(desc);
4409     AddTpaAssemblyUserObject (entry);
4410 
4411     STANDARD_SETUP
4412 
4413     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide",
4414                               "Nucleic acid with protein sequence method"));
4415     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4416                               "MolType descriptor is obsolete"));
4417     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4418                               "Modif descriptor is obsolete"));
4419     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4420                               "Method descriptor is obsolete"));
4421     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4422                               "OrgRef descriptor is obsolete"));
4423     //AddChromosomeNoLocation(expected_errors, entry);
4424 
4425     // won't complain about TPA assembly if only local ID
4426     eval = validator.Validate(seh, options);
4427     CheckErrors (*eval, expected_errors);
4428 
4429     CLEAR_ERRORS
4430 
4431     scope.RemoveTopLevelSeqEntry(seh);
4432     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
4433     unit_test_util::RemoveDescriptorType (entry, CSeqdesc::e_Mol_type);
4434     unit_test_util::RemoveDescriptorType (entry, CSeqdesc::e_Modif);
4435     unit_test_util::RemoveDescriptorType (entry, CSeqdesc::e_Method);
4436     unit_test_util::RemoveDescriptorType (entry, CSeqdesc::e_Org);
4437     seh = scope.AddTopLevelSeqEntry(*entry);
4438     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "TPAassemblyWithoutTPAKeyword",
4439                               "Non-TPA record gb|AY123456| should not have TpaAssembly object"));
4440     //AddChromosomeNoLocation(expected_errors, entry);
4441     SetErrorsAccessions(expected_errors, "gb|AY123456|");
4442     eval = validator.Validate(seh, options);
4443     CheckErrors (*eval, expected_errors);
4444 
4445     scope.RemoveTopLevelSeqEntry(seh);
4446     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
4447     seh = scope.AddTopLevelSeqEntry(*entry);
4448     SetErrorsAccessions(expected_errors, "ref|NC_123456|");
4449     expected_errors[0]->SetErrMsg("Non-TPA record ref|NC_123456| should not have TpaAssembly object");
4450     eval = validator.Validate(seh, options);
4451     CheckErrors (*eval, expected_errors);
4452 
4453     desc.Reset(new CSeqdesc());
4454     desc->SetMol_type(eGIBB_mol_peptide);
4455     entry->SetDescr().Set().push_back(desc);
4456     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForTypeGIBB",
4457                               "Nucleic acid with GIBB-mol = peptide"));
4458     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InvalidForType",
4459                               "MolType descriptor is obsolete"));
4460     eval = validator.Validate(seh, options);
4461     CheckErrors (*eval, expected_errors);
4462 
4463     desc->SetMol_type(eGIBB_mol_other);
4464     expected_errors[1]->SetErrMsg("GIBB-mol unknown or other used");
4465     eval = validator.Validate(seh, options);
4466     CheckErrors (*eval, expected_errors);
4467 
4468     desc->SetMol_type(eGIBB_mol_unknown);
4469     eval = validator.Validate(seh, options);
4470     CheckErrors (*eval, expected_errors);
4471 
4472     CLEAR_ERRORS
4473 
4474     scope.RemoveTopLevelSeqEntry(seh);
4475     entry = unit_test_util::BuildGoodProtSeq();
4476     desc.Reset(new CSeqdesc());
4477     desc->SetMol_type(eGIBB_mol_genomic);
4478     entry->SetDescr().Set().push_back(desc);
4479     seh = scope.AddTopLevelSeqEntry(*entry);
4480     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4481                               "GIBB-mol [1] used on protein"));
4482     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4483                               "MolType descriptor is obsolete"));
4484     //AddChromosomeNoLocation(expected_errors, entry);
4485     eval = validator.Validate(seh, options);
4486     CheckErrors (*eval, expected_errors);
4487 
4488     desc->SetMol_type(eGIBB_mol_pre_mRNA);
4489     expected_errors[0]->SetErrMsg("GIBB-mol [2] used on protein");
4490     eval = validator.Validate(seh, options);
4491     CheckErrors (*eval, expected_errors);
4492 
4493     desc->SetMol_type(eGIBB_mol_mRNA);
4494     expected_errors[0]->SetErrMsg("GIBB-mol [3] used on protein");
4495     eval = validator.Validate(seh, options);
4496     CheckErrors (*eval, expected_errors);
4497 
4498     desc->SetMol_type(eGIBB_mol_rRNA);
4499     expected_errors[0]->SetErrMsg("GIBB-mol [4] used on protein");
4500     eval = validator.Validate(seh, options);
4501     CheckErrors (*eval, expected_errors);
4502 
4503     desc->SetMol_type(eGIBB_mol_tRNA);
4504     expected_errors[0]->SetErrMsg("GIBB-mol [5] used on protein");
4505     eval = validator.Validate(seh, options);
4506     CheckErrors (*eval, expected_errors);
4507 
4508     desc->SetMol_type(eGIBB_mol_snRNA);
4509     expected_errors[0]->SetErrMsg("GIBB-mol [6] used on protein");
4510     eval = validator.Validate(seh, options);
4511     CheckErrors (*eval, expected_errors);
4512 
4513     desc->SetMol_type(eGIBB_mol_scRNA);
4514     expected_errors[0]->SetErrMsg("GIBB-mol [7] used on protein");
4515     eval = validator.Validate(seh, options);
4516     CheckErrors (*eval, expected_errors);
4517 
4518     desc->SetMol_type(eGIBB_mol_other_genetic);
4519     expected_errors[0]->SetErrMsg("GIBB-mol [9] used on protein");
4520     eval = validator.Validate(seh, options);
4521     CheckErrors (*eval, expected_errors);
4522 
4523     desc->SetMol_type(eGIBB_mol_genomic_mRNA);
4524     expected_errors[0]->SetErrMsg("GIBB-mol [10] used on protein");
4525     eval = validator.Validate(seh, options);
4526     CheckErrors (*eval, expected_errors);
4527 
4528     CLEAR_ERRORS
4529 
4530     // invalid modif
4531     desc->SetModif().push_back(eGIBB_mod_dna);
4532     desc->SetModif().push_back(eGIBB_mod_rna);
4533     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4534                               "Nucleic acid GIBB-mod [0] on protein"));
4535     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForTypeGIBB",
4536                               "Nucleic acid GIBB-mod [1] on protein"));
4537     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
4538                               "Modif descriptor is obsolete"));
4539     //AddChromosomeNoLocation(expected_errors, entry);
4540     eval = validator.Validate(seh, options);
4541     CheckErrors (*eval, expected_errors);
4542 
4543     CLEAR_ERRORS
4544 
4545     scope.RemoveTopLevelSeqEntry(seh);
4546     entry = unit_test_util::BuildGoodSeq();
4547     NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
4548         if ((*it)->IsSource()) {
4549             (*it)->SetSource().SetOrigin(CBioSource::eOrigin_synthetic);
4550         }
4551     }
4552     seh = scope.AddTopLevelSeqEntry(*entry);
4553     // if biomol not other, should generate error
4554     expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Warning, "InvalidForType",
4555                                                   "Molinfo-biomol other should be used if Biosource-location is synthetic"));
4556     //AddChromosomeNoLocation(expected_errors, entry);
4557     eval = validator.Validate(seh, options);
4558     CheckErrors (*eval, expected_errors);
4559 
4560     CLEAR_ERRORS
4561 
4562     NON_CONST_ITERATE (CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
4563         if ((*it)->IsSource()) {
4564             (*it)->SetSource().ResetOrigin();
4565         }
4566     }
4567 
4568     unit_test_util::SetBiomol (entry, CMolInfo::eBiomol_peptide);
4569     expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Error, "InvalidMolInfo",
4570                                                   "Nucleic acid with Molinfo = peptide"));
4571     //AddChromosomeNoLocation(expected_errors, entry);
4572     eval = validator.Validate(seh, options);
4573     CheckErrors (*eval, expected_errors);
4574     CLEAR_ERRORS
4575 
4576     unit_test_util::SetBiomol (entry, CMolInfo::eBiomol_other_genetic);
4577     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4578         "MoltypeOtherGenetic", "Molinfo-biomol = other genetic"));
4579     //AddChromosomeNoLocation(expected_errors, entry);
4580     eval = validator.Validate(seh, options);
4581     CheckErrors (*eval, expected_errors);
4582     CLEAR_ERRORS
4583 
4584     unit_test_util::SetBiomol (entry, CMolInfo::eBiomol_unknown);
4585     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4586         "MoltypeUnknown", "Molinfo-biomol unknown used"));
4587     //AddChromosomeNoLocation(expected_errors, entry);
4588     eval = validator.Validate(seh, options);
4589     CheckErrors (*eval, expected_errors);
4590     CLEAR_ERRORS
4591 
4592     unit_test_util::SetBiomol (entry, CMolInfo::eBiomol_other);
4593     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
4594         "MoltypeOther", "Molinfo-biomol other used"));
4595     //AddChromosomeNoLocation(expected_errors, entry);
4596     eval = validator.Validate(seh, options);
4597     CheckErrors (*eval, expected_errors);
4598     CLEAR_ERRORS
4599 
4600     scope.RemoveTopLevelSeqEntry(seh);
4601     entry = unit_test_util::BuildGoodProtSeq();
4602     seh = scope.AddTopLevelSeqEntry(*entry);
4603 
4604     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4605         "InvalidForType", "Molinfo-biomol [1] used on protein"));
4606     //AddChromosomeNoLocation(expected_errors, entry);
4607     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic);
4608     expected_errors[0]->SetErrMsg("Molinfo-biomol [1] used on protein");
4609     eval = validator.Validate(seh, options);
4610     CheckErrors (*eval, expected_errors);
4611 
4612     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_pre_RNA);
4613     expected_errors[0]->SetErrMsg("Molinfo-biomol [2] used on protein");
4614     eval = validator.Validate(seh, options);
4615     CheckErrors (*eval, expected_errors);
4616 
4617     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
4618     expected_errors[0]->SetErrMsg("Molinfo-biomol [3] used on protein");
4619     eval = validator.Validate(seh, options);
4620     CheckErrors (*eval, expected_errors);
4621 
4622     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_rRNA);
4623     expected_errors[0]->SetErrMsg("Molinfo-biomol [4] used on protein");
4624     eval = validator.Validate(seh, options);
4625     CheckErrors (*eval, expected_errors);
4626 
4627     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_tRNA);
4628     expected_errors[0]->SetErrMsg("Molinfo-biomol [5] used on protein");
4629     eval = validator.Validate(seh, options);
4630     CheckErrors (*eval, expected_errors);
4631 
4632     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_snRNA);
4633     expected_errors[0]->SetErrMsg("Molinfo-biomol [6] used on protein");
4634     eval = validator.Validate(seh, options);
4635     CheckErrors (*eval, expected_errors);
4636 
4637     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_scRNA);
4638     expected_errors[0]->SetErrMsg("Molinfo-biomol [7] used on protein");
4639     eval = validator.Validate(seh, options);
4640     CheckErrors (*eval, expected_errors);
4641 
4642     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic_mRNA);
4643     expected_errors[0]->SetErrMsg("Molinfo-biomol [10] used on protein");
4644     eval = validator.Validate(seh, options);
4645     CheckErrors (*eval, expected_errors);
4646 
4647     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_cRNA);
4648     expected_errors[0]->SetErrMsg("Molinfo-biomol [11] used on protein");
4649     eval = validator.Validate(seh, options);
4650     CheckErrors (*eval, expected_errors);
4651 
4652     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_snoRNA);
4653     expected_errors[0]->SetErrMsg("Molinfo-biomol [12] used on protein");
4654     eval = validator.Validate(seh, options);
4655     CheckErrors (*eval, expected_errors);
4656 
4657     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_transcribed_RNA);
4658     expected_errors[0]->SetErrMsg("Molinfo-biomol [13] used on protein");
4659     eval = validator.Validate(seh, options);
4660     CheckErrors (*eval, expected_errors);
4661 
4662     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_ncRNA);
4663     expected_errors[0]->SetErrMsg("Molinfo-biomol [14] used on protein");
4664     eval = validator.Validate(seh, options);
4665     CheckErrors (*eval, expected_errors);
4666 
4667     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_tmRNA);
4668     expected_errors[0]->SetErrMsg("Molinfo-biomol [15] used on protein");
4669     eval = validator.Validate(seh, options);
4670     CheckErrors (*eval, expected_errors);
4671 
4672     CLEAR_ERRORS
4673 
4674     scope.RemoveTopLevelSeqEntry(seh);
4675     entry = unit_test_util::BuildGoodSeq();
4676     seh = scope.AddTopLevelSeqEntry(*entry);
4677     unit_test_util::SetSynthetic_construct(entry);
4678     expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Warning, "SyntheticConstructWrongMolType",
4679                                                   "synthetic construct should have other-genetic"));
4680     expected_errors.push_back(new CExpectedError ("lcl|good", eDiag_Warning, "SyntheticConstructNeedsArtificial",
4681                                                   "synthetic construct should have artificial origin"));
4682     //AddChromosomeNoLocation(expected_errors, entry);
4683     eval = validator.Validate(seh, options);
4684     CheckErrors (*eval, expected_errors);
4685 
4686     CLEAR_ERRORS
4687 
4688     unit_test_util::SetSebaea_microphylla(entry);
4689 
4690     SetTech(entry, CMolInfo::eTech_concept_trans);
4691     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProteinTechniqueOnNucleotide",
4692                                                  "Nucleic acid with protein sequence method"));
4693     //AddChromosomeNoLocation(expected_errors, entry);
4694     eval = validator.Validate(seh, options);
4695     CheckErrors (*eval, expected_errors);
4696 
4697     SetTech(entry, CMolInfo::eTech_seq_pept);
4698     eval = validator.Validate(seh, options);
4699     CheckErrors (*eval, expected_errors);
4700 
4701     SetTech(entry, CMolInfo::eTech_both);
4702     eval = validator.Validate(seh, options);
4703     CheckErrors (*eval, expected_errors);
4704 
4705     SetTech(entry, CMolInfo::eTech_seq_pept_overlap);
4706     eval = validator.Validate(seh, options);
4707     CheckErrors (*eval, expected_errors);
4708 
4709     SetTech(entry, CMolInfo::eTech_seq_pept_homol);
4710     eval = validator.Validate(seh, options);
4711     CheckErrors (*eval, expected_errors);
4712 
4713     SetTech(entry, CMolInfo::eTech_concept_trans_a);
4714     eval = validator.Validate(seh, options);
4715     CheckErrors (*eval, expected_errors);
4716 
4717     CLEAR_ERRORS
4718 
4719     scope.RemoveTopLevelSeqEntry(seh);
4720     entry = unit_test_util::BuildGoodProtSeq();
4721     seh = scope.AddTopLevelSeqEntry(*entry);
4722     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
4723         "NucleotideTechniqueOnProtein", "Protein with nucleic acid sequence method"));
4724     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ESTshouldBemRNA",
4725                                                  "EST sequence should be mRNA"));
4726 
4727     //AddChromosomeNoLocation(expected_errors, entry);
4728     SetTech(entry, CMolInfo::eTech_est);
4729     eval = validator.Validate(seh, options);
4730     CheckErrors (*eval, expected_errors);
4731 
4732     CLEAR_ERRORS
4733 
4734     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NucleotideTechniqueOnProtein",
4735                                                  "Protein with nucleic acid sequence method"));
4736     //AddChromosomeNoLocation(expected_errors, entry);
4737     SetTech(entry, CMolInfo::eTech_genemap);
4738     eval = validator.Validate(seh, options);
4739     CheckErrors (*eval, expected_errors);
4740 
4741     SetTech(entry, CMolInfo::eTech_physmap);
4742     eval = validator.Validate(seh, options);
4743     CheckErrors (*eval, expected_errors);
4744 
4745     SetTech(entry, CMolInfo::eTech_fli_cdna);
4746     eval = validator.Validate(seh, options);
4747     CheckErrors (*eval, expected_errors);
4748 
4749     SetTech(entry, CMolInfo::eTech_htc);
4750     eval = validator.Validate(seh, options);
4751     CheckErrors (*eval, expected_errors);
4752 
4753     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
4754                                                  "HTGS/STS/GSS/WGS sequence should be genomic"));
4755     SetTech(entry, CMolInfo::eTech_sts);
4756     eval = validator.Validate(seh, options);
4757     CheckErrors (*eval, expected_errors);
4758 
4759     SetTech(entry, CMolInfo::eTech_htgs_1);
4760     eval = validator.Validate(seh, options);
4761     CheckErrors (*eval, expected_errors);
4762 
4763     SetTech(entry, CMolInfo::eTech_htgs_3);
4764     eval = validator.Validate(seh, options);
4765     CheckErrors (*eval, expected_errors);
4766 
4767     SetTech(entry, CMolInfo::eTech_htgs_0);
4768     eval = validator.Validate(seh, options);
4769     CheckErrors (*eval, expected_errors);
4770 
4771     SetTech(entry, CMolInfo::eTech_composite_wgs_htgs);
4772     eval = validator.Validate(seh, options);
4773     CheckErrors (*eval, expected_errors);
4774 
4775     SetTech(entry, CMolInfo::eTech_wgs);
4776     eval = validator.Validate(seh, options);
4777     AddChromosomeNoLocation(expected_errors, entry);
4778     CheckErrors (*eval, expected_errors);
4779 
4780     CLEAR_ERRORS
4781 
4782     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadHTGSeq",
4783                                                  "HTGS 2 raw seq has no gaps and no graphs"));
4784     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NucleotideTechniqueOnProtein",
4785                                                  "Protein with nucleic acid sequence method"));
4786     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
4787                                                  "HTGS/STS/GSS/WGS sequence should be genomic"));
4788 
4789     //AddChromosomeNoLocation(expected_errors, entry);
4790     SetTech(entry, CMolInfo::eTech_htgs_2);
4791     eval = validator.Validate(seh, options);
4792     CheckErrors (*eval, expected_errors);
4793 
4794     CLEAR_ERRORS
4795 
4796     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "NoKeywordHasTechnique",
4797                                                  "Molinfo.tech barcode without BARCODE keyword"));
4798     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NucleotideTechniqueOnProtein",
4799                                                  "Protein with nucleic acid sequence method"));
4800 
4801     //AddChromosomeNoLocation(expected_errors, entry);
4802     SetTech(entry, CMolInfo::eTech_barcode);
4803     eval = validator.Validate(seh, options);
4804     CheckErrors (*eval, expected_errors);
4805 
4806     CLEAR_ERRORS
4807 }
4808 
4809 
BOOST_AUTO_TEST_CASE(Test_Descr_Unknown)
{
    // Build the entry, then attach a Modif descriptor whose only value is
    // eGIBB_mod_other.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    {
        CRef<CSeqdesc> modif_desc(new CSeqdesc());
        modif_desc->SetModif().push_back(eGIBB_mod_other);
        entry->SetDescr().Set().push_back(modif_desc);
    }

    STANDARD_SETUP

    // Expect one error for the Modif descriptor itself ...
    CExpectedError* obsolete_modif =
        new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
                           "Modif descriptor is obsolete");
    expected_errors.push_back(obsolete_modif);

    // ... and one for the "other" value it carries.
    CExpectedError* other_modif =
        new CExpectedError("lcl|good", eDiag_Error, "Unknown",
                           "GIBB-mod = other used");
    expected_errors.push_back(other_modif);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
4831 
4832 
MakeGps(CRef<CSeq_entry> member)4833 static CRef<CSeq_entry> MakeGps(CRef<CSeq_entry> member)
4834 {
4835     CRef<CSeq_entry> set(new CSeq_entry());
4836     set->SetSet().SetClass(CBioseq_set::eClass_gen_prod_set);
4837     set->SetSet().SetSeq_set().push_back(member);
4838     return set;
4839 }
4840 
4841 
BOOST_AUTO_TEST_CASE(Test_Descr_NoPubFound)4842 BOOST_AUTO_TEST_CASE(Test_Descr_NoPubFound)
4843 {
4844     // prepare entry
4845     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
4846     unit_test_util::RemoveDescriptorType (entry, CSeqdesc::e_Pub);
4847 
4848     STANDARD_SETUP
4849 
4850     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoPubFound",
4851                               "No publications anywhere on this entire record."));
4852     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "MissingPubRequirement",
4853                               "No submission citation anywhere on this entire record."));
4854     //AddChromosomeNoLocation(expected_errors, entry);
4855     eval = validator.Validate(seh, options);
4856     CheckErrors (*eval, expected_errors);
4857 
4858     CLEAR_ERRORS
4859 
4860     // make gpipe - should suppress error
4861     scope.RemoveTopLevelSeqEntry(seh);
4862     CRef<CSeq_id> id_suppress(new CSeq_id());
4863     id_suppress->SetGpipe().SetAccession("AY123456");
4864     entry->SetSet().SetSeq_set().front()->SetSeq().SetId().push_back(id_suppress);
4865     seh = scope.AddTopLevelSeqEntry(*entry);
4866     expected_errors.push_back(new CExpectedError("gpp|AY123456|", eDiag_Info, "MissingPubRequirement",
4867                               "No submission citation anywhere on this entire record."));
4868     //AddChromosomeNoLocation(expected_errors, "gpp|AY123456|");
4869     eval = validator.Validate(seh, options);
4870     CheckErrors (*eval, expected_errors);
4871 
4872     CLEAR_ERRORS
4873 
4874     // make GPS - will suppress pub errors, although introduce gps erros
4875     scope.RemoveTopLevelSeqEntry(seh);
4876     entry->SetSet().SetSeq_set().front()->SetSeq().SetId().pop_back();
4877     CRef<CSeq_entry> gps = MakeGps(entry);
4878     seh = scope.AddTopLevelSeqEntry(*gps);
4879     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
4880                               "GenomicProductPackagingProblem",
4881                               "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
4882     expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning,
4883                               "GenomicProductPackagingProblem",
4884                               "Protein bioseq should be product of CDS feature on contig, but is not"));
4885     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "MissingPubRequirement",
4886                               "No submission citation anywhere on this entire record."));
4887 
4888     //AddChromosomeNoLocation(expected_errors, entry);
4889     eval = validator.Validate(seh, options);
4890     CheckErrors (*eval, expected_errors);
4891 
4892     CLEAR_ERRORS
4893 
4894     // only one has pub
4895     scope.RemoveTopLevelSeqEntry(seh);
4896     entry = unit_test_util::BuildGoodNucProtSet();
4897     unit_test_util::RemoveDescriptorType (entry, CSeqdesc::e_Pub);
4898     unit_test_util::AddGoodPub(entry->SetSet().SetSeq_set().front());
4899     seh = scope.AddTopLevelSeqEntry(*entry);
4900     expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "NoPubFound",
4901                               "No publications refer to this Bioseq."));
4902     expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "MissingPubRequirement",
4903                               "Expected submission citation is missing for this Bioseq"));
4904 
4905     //AddChromosomeNoLocation(expected_errors, entry);
4906     eval = validator.Validate(seh, options);
4907     CheckErrors (*eval, expected_errors);
4908 
4909     CLEAR_ERRORS
4910 
4911     // intermediate wgs should suppress NoPubFound
4912     scope.RemoveTopLevelSeqEntry(seh);
4913     id_suppress->SetOther().SetAccession("NC_123456");
4914     entry->SetSet().SetSeq_set().front()->SetSeq().SetId().push_back(id_suppress);
4915     SetTech (entry->SetSet().SetSeq_set().front(), CMolInfo::eTech_wgs);
4916     seh = scope.AddTopLevelSeqEntry(*entry);
4917 
4918     expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Info, "MissingPubRequirement",
4919         "Expected submission citation is missing for this Bioseq"));
4920     AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
4921     eval = validator.Validate(seh, options);
4922     CheckErrors (*eval, expected_errors);
4923 
4924     CLEAR_ERRORS
4925 }
4926 
4927 
BOOST_AUTO_TEST_CASE(Test_Descr_NoOrgFound)4928 BOOST_AUTO_TEST_CASE(Test_Descr_NoOrgFound)
4929 {
4930     // prepare entry
4931     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
4932     unit_test_util::RemoveDescriptorType (entry, CSeqdesc::e_Source);
4933 
4934     STANDARD_SETUP
4935 
4936     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing",
4937                               "Nuc-prot set does not contain expected BioSource descriptor"));
4938     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoSourceDescriptor",
4939                               "No source information included on this record."));
4940 
4941     eval = validator.Validate(seh, options);
4942     CheckErrors (*eval, expected_errors);
4943 
4944     CLEAR_ERRORS
4945 
4946     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing",
4947             "Nuc-prot set does not contain expected BioSource descriptor"));
4948 
4949     // suppress if patent or pdb
4950     scope.RemoveTopLevelSeqEntry(seh);
4951     CRef<CSeq_id> id2(new CSeq_id());
4952     id2->SetPatent().SetSeqid(1);
4953     id2->SetPatent().SetCit().SetCountry("USA");
4954     id2->SetPatent().SetCit().SetId().SetNumber("1");
4955     entry->SetSet().SetSeq_set().front()->SetSeq().SetId().push_back(id2);
4956     seh = scope.AddTopLevelSeqEntry(*entry);
4957     eval = validator.Validate(seh, options);
4958     CheckErrors (*eval, expected_errors);
4959 
4960     scope.RemoveTopLevelSeqEntry(seh);
4961     CRef<CPDB_seq_id> pdb_id(new CPDB_seq_id());
4962     pdb_id->SetMol().Set("foo");
4963     id2->SetPdb(*pdb_id);
4964     seh = scope.AddTopLevelSeqEntry(*entry);
4965     SetErrorsAccessions(expected_errors, "pdb|foo| ");
4966     eval = validator.Validate(seh, options);
4967     CheckErrors (*eval, expected_errors);
4968 
4969     // add one source
4970     scope.RemoveTopLevelSeqEntry(seh);
4971     entry->SetSet().SetSeq_set().front()->SetSeq().SetId().pop_back();
4972     unit_test_util::AddGoodSource (entry->SetSet().SetSeq_set().front());
4973     seh = scope.AddTopLevelSeqEntry(*entry);
4974     SetErrorsAccessions(expected_errors, "lcl|nuc");
4975     expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound",
4976                               "No organism name included in the source. Other qualifiers may exist."));
4977     //AddChromosomeNoLocation(expected_errors, entry);
4978 
4979     eval = validator.Validate(seh, options);
4980     CheckErrors (*eval, expected_errors);
4981 
4982     CLEAR_ERRORS
4983 
4984     // if there is a source descriptor but no tax name, still produce error
4985     unit_test_util::AddGoodSource(entry->SetSet().SetSeq_set().back());
4986     unit_test_util::SetTaxname(entry->SetSet().SetSeq_set().back(), "");
4987     expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Fatal, "NoOrgFound",
4988                               "No organism name included in the source. Other qualifiers may exist."));
4989     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceOnProtein",
4990                                                  "Nuc-prot set has 1 protein with a BioSource descriptor"));
4991     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceMissing",
4992                               "Nuc-prot set does not contain expected BioSource descriptor"));
4993     //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
4994     //AddChromosomeNoLocation(expected_errors, "lcl|prot");
4995     eval = validator.Validate(seh, options);
4996     CheckErrors (*eval, expected_errors);
4997 
4998     CLEAR_ERRORS
4999 }
5000 
5001 
BOOST_AUTO_TEST_CASE(Test_Descr_MultipleBioSources)
{
    // Add one more source descriptor on top of what BuildGoodSeq provides.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::AddGoodSource(entry);

    STANDARD_SETUP

    CExpectedError* multiple_sources =
        new CExpectedError("lcl|good", eDiag_Error, "MultipleBioSources",
                           "Undesired multiple source descriptors");
    expected_errors.push_back(multiple_sources);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5020 
5021 
BOOST_AUTO_TEST_CASE(Test_Descr_NoMolInfoFound)
{
    // Strip every Molinfo descriptor from the entry.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Molinfo);

    STANDARD_SETUP

    CExpectedError* no_molinfo =
        new CExpectedError("lcl|good", eDiag_Error, "NoMolInfoFound",
                           "No Mol-info applies to this Bioseq");
    expected_errors.push_back(no_molinfo);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5039 
5040 
BOOST_AUTO_TEST_CASE(Test_Descr_NoTaxonID)
{
    // Set the taxon id of the entry's source to 0.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetTaxon(entry, 0);

    STANDARD_SETUP

    CExpectedError* missing_taxid =
        new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
                           "BioSource is missing taxon ID");
    expected_errors.push_back(missing_taxid);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5057 
5058 
BOOST_AUTO_TEST_CASE(Test_Descr_InconsistentBiosources)5059 BOOST_AUTO_TEST_CASE(Test_Descr_InconsistentBiosources)
5060 {
5061     // prepare entry
5062     CRef<CSeq_entry> entry(new CSeq_entry());
5063     entry->SetSet().SetClass(CBioseq_set::eClass_pop_set);
5064     CRef<CSeq_entry> first = unit_test_util::BuildGoodSeq();
5065     entry->SetSet().SetSeq_set().push_back(first);
5066     CRef<CSeq_entry> second = unit_test_util::BuildGoodSeq();
5067     second->SetSeq().SetId().front()->SetLocal().SetStr("good2");
5068     unit_test_util::SetTaxname(second, "");
5069     unit_test_util::SetTaxon(second, 0);
5070     unit_test_util::SetTaxname(second, "Trichechus manatus latirostris");
5071     unit_test_util::SetTaxon(second, 127582);
5072     entry->SetSet().SetSeq_set().push_back(second);
5073 
5074     CRef<CSeqdesc> desc(new CSeqdesc());
5075     desc->SetTitle("popset title");
5076     entry->SetSet().SetDescr().Set().push_back(desc);
5077 
5078     STANDARD_SETUP
5079 
5080     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentTaxNameSet",
5081                               "Population set contains inconsistent organism names."));
5082     //AddChromosomeNoLocation(expected_errors, "lcl|good");
5083     //AddChromosomeNoLocation(expected_errors, "lcl|good2");
5084 
5085     eval = validator.Validate(seh, options);
5086     CheckErrors (*eval, expected_errors);
5087 
5088     // warning instead of error if same up to ' sp. '
5089     unit_test_util::SetTaxname(first, "");
5090     unit_test_util::SetTaxon(first, 0);
5091     unit_test_util::SetTaxname(first, "Corynebacterium sp. 979");
5092     unit_test_util::SetTaxon(first, 215582);
5093     unit_test_util::SetTaxname(second, "");
5094     unit_test_util::SetTaxon(second, 0);
5095     unit_test_util::SetTaxname(second, "Corynebacterium sp. DJ1");
5096     unit_test_util::SetTaxon(second, 632939);
5097     expected_errors[0]->SetSeverity(eDiag_Warning);
5098     eval = validator.Validate(seh, options);
5099     CheckErrors (*eval, expected_errors);
5100 
5101     // warning instead of error if one name is subset of the other
5102     unit_test_util::SetTaxname(first, "");
5103     unit_test_util::SetTaxon(first, 0);
5104     unit_test_util::SetTaxname(first, "Trichechus manatus");
5105     unit_test_util::SetTaxon(first, 9778);
5106     unit_test_util::SetTaxname(second, "");
5107     unit_test_util::SetTaxon(second, 0);
5108     unit_test_util::SetTaxname(second, "Trichechus manatus latirostris");
5109     unit_test_util::SetTaxon(second, 127582);
5110     eval = validator.Validate(seh, options);
5111     CheckErrors (*eval, expected_errors);
5112 
5113     CLEAR_ERRORS
5114 
5115     // no error if not pop-set
5116     unit_test_util::SetTaxname(first, "");
5117     unit_test_util::SetTaxon(first, 0);
5118     unit_test_util::SetTaxname(first, "Corynebacterium sp. 979");
5119     unit_test_util::SetTaxon(first, 215582);
5120     unit_test_util::SetTaxname(second, "");
5121     unit_test_util::SetTaxon(second, 0);
5122     unit_test_util::SetTaxname(second, "Trichechus manatus latirostris");
5123     unit_test_util::SetTaxon(second, 127582);
5124     entry->SetSet().SetClass(CBioseq_set::eClass_genbank);
5125     unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Title);
5126     //AddChromosomeNoLocation(expected_errors, "lcl|good");
5127     //AddChromosomeNoLocation(expected_errors, "lcl|good2");
5128     eval = validator.Validate(seh, options);
5129     CheckErrors (*eval, expected_errors);
5130 
5131     CLEAR_ERRORS
5132 }
5133 
5134 
BOOST_AUTO_TEST_CASE(Test_Descr_MissingLineage)
{
    // Entry whose orgname has been reset.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::ResetOrgname(entry);

    STANDARD_SETUP

    // Validate the entry behind `seh` and compare with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Case 1: orgname reset -> Error.
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "MissingLineage",
                           "No lineage for this BioSource."));
    validate_and_check();

    // Case 2: lineage explicitly set to the empty string -> same expectation.
    unit_test_util::SetLineage(entry, "");
    validate_and_check();

    // Case 3: EMBL accession -> severity becomes Warning.
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSeq().SetId().front()->SetEmbl().SetAccession("B12345");
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.front()->SetSeverity(eDiag_Warning);
    ChangeErrorAcc(expected_errors, "emb|B12345|");
    validate_and_check();

    // Case 4: DDBJ accession -> Warning as well.
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSeq().SetId().front()->SetDdbj().SetAccession("C12345");
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.front()->SetSeverity(eDiag_Warning);
    ChangeErrorAcc(expected_errors, "dbj|C12345|");
    validate_and_check();

    // Case 5: RefSeq-style ("other") accession with the taxon id still set
    // -> Critical.
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.front()->SetSeverity(eDiag_Critical);
    ChangeErrorAcc(expected_errors, "ref|NC_123456|");
    validate_and_check();

    // Case 6: same accession with the taxon id set to 0 -> back to Error,
    // plus a NoTaxonID error.
    unit_test_util::SetTaxon(entry, 0);
    expected_errors.front()->SetSeverity(eDiag_Error);
    expected_errors.push_back(
        new CExpectedError("ref|NC_123456|", eDiag_Error, "NoTaxonID",
                           "BioSource is missing taxon ID"));
    validate_and_check();

    CLEAR_ERRORS
}
5192 
5193 
BOOST_AUTO_TEST_CASE(Test_Descr_SerialInComment)
{
    // Attach a comment descriptor that ends in a bracketed number.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    {
        CRef<CSeqdesc> serial_comment(new CSeqdesc());
        serial_comment->SetComment("blah blah [123456]");
        entry->SetSeq().SetDescr().Set().push_back(serial_comment);
    }

    STANDARD_SETUP

    CExpectedError* serial_in_comment =
        new CExpectedError("lcl|good", eDiag_Info, "SerialInComment",
                           "Comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead.");
    expected_errors.push_back(serial_in_comment);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5213 
5214 
BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceNeedsFocus)
{
    // Entry with an additional BioSource feature.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::AddGoodSourceFeature(entry);

    STANDARD_SETUP

    // Validate the entry behind `seh` and compare with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Case 1: descriptor has neither focus nor transgenic.
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "BioSourceNeedsFocus",
                           "BioSource descriptor must have focus or transgenic when BioSource feature with different taxname is present."));
    validate_and_check();

    CLEAR_ERRORS

    // Case 2: focus set on the descriptor -> nothing expected.
    unit_test_util::SetFocus(entry);
    validate_and_check();

    // Case 3: focus cleared, descriptor marked transgenic -> nothing expected.
    unit_test_util::ClearFocus(entry);
    unit_test_util::SetTransgenic(entry, true);
    validate_and_check();

    CLEAR_ERRORS
}
5247 
5248 
BOOST_AUTO_TEST_CASE(Test_Descr_BadOrganelle)
{
    // Start with the genome location set to kinetoplast.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetGenome(entry, CBioSource::eGenome_kinetoplast);

    STANDARD_SETUP

    // Validate the entry behind `seh` and compare with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Case 1: kinetoplast.
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "BadOrganelleLocation",
                           "Only Kinetoplastida have kinetoplasts"));
    validate_and_check();

    // Case 2: nucleomorph -> different message plus a taxonomy warning.
    unit_test_util::SetGenome(entry, CBioSource::eGenome_nucleomorph);
    expected_errors.front()->SetErrMsg("Only Chlorarachniophyceae and Cryptophyta have nucleomorphs");
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyNucleomorphProblem",
                           "Taxonomy lookup does not have expected nucleomorph flag"));
    validate_and_check();

    CLEAR_ERRORS

    // Case 3: macronuclear.
    unit_test_util::SetGenome(entry, CBioSource::eGenome_macronuclear);
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "BadOrganelleLocation",
                           "Only Ciliophora have macronuclear locations"));
    validate_and_check();

    CLEAR_ERRORS

    // Case 4: plastid location on Drosophila melanogaster.
    unit_test_util::SetDrosophila_melanogaster(entry);
    unit_test_util::SetGenome(entry, CBioSource::eGenome_plastid);
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyPlastidsProblem",
                           "Taxonomy lookup does not have expected plastid flag"));
    validate_and_check();

    CLEAR_ERRORS

    // Case 5: same plastid location on Sebaea microphylla -> nothing expected.
    unit_test_util::SetSebaea_microphylla(entry);
    validate_and_check();

    CLEAR_ERRORS
}
5299 
5300 
BOOST_AUTO_TEST_CASE(Test_Descr_MultipleChromosomes)
{
    // Call SetChromosome with "1" on the freshly built entry.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetChromosome(entry, "1");

    STANDARD_SETUP

    // Validate the entry behind `seh` and compare with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Case 1: the "identical chromosome qualifiers" warning is expected.
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers",
                           "Multiple identical chromosome qualifiers present"));
    validate_and_check();

    // Case 2: after SetChromosome with "2" the message changes to
    // "conflicting".
    unit_test_util::SetChromosome(entry, "2");
    expected_errors.front()->SetErrMsg("Multiple conflicting chromosome qualifiers present");
    validate_and_check();

    CLEAR_ERRORS
}
5323 
5324 
BOOST_AUTO_TEST_CASE(Test_Descr_BadSubSource)
{
    // Add a subsource with subtype 0 and value "foo".
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, 0, "foo");

    STANDARD_SETUP

    CExpectedError* unknown_subtype =
        new CExpectedError("lcl|good", eDiag_Critical, "BadSubSource",
                           "Unknown subsource subtype 0");
    expected_errors.push_back(unknown_subtype);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5342 
ShowOrgRef(const COrg_ref & org)5343 void ShowOrgRef(const COrg_ref& org)
5344 {
5345     ESerialDataFormat outFormat = eSerial_AsnText;
5346     auto_ptr<CObjectOStream> os;
5347     os.reset(CObjectOStream::Open(outFormat, cout));
5348     *os << org;
5349 }
5350 
5351 
ShowOrgRef(const CSeq_entry & entry)5352 void ShowOrgRef(const CSeq_entry& entry)
5353 {
5354     if (entry.IsSeq()) {
5355         if (entry.GetSeq().IsSetDescr()) {
5356             ITERATE(objects::CSeq_descr::Tdata, it, entry.GetSeq().GetDescr().Get()) {
5357                 if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
5358                     ShowOrgRef((*it)->GetSource().GetOrg());
5359                 }
5360             }
5361         }
5362     } else if (entry.IsSet()) {
5363         if (entry.GetSet().IsSetDescr()) {
5364             ITERATE(objects::CSeq_descr::Tdata, it, entry.GetSet().GetDescr().Get()) {
5365                 if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
5366                     ShowOrgRef((*it)->GetSource().GetOrg());
5367                 }
5368             }
5369         }
5370         if (entry.GetSet().IsSetSeq_set()) {
5371             ITERATE(objects::CBioseq_set::TSeq_set, it, entry.GetSet().GetSeq_set()) {
5372                 ShowOrgRef(**it);
5373             }
5374         }
5375     }
5376 }
5377 
5378 
BOOST_AUTO_TEST_CASE(Test_Descr_BadOrgMod)
{
    // Decorate a good sequence with a series of problematic OrgMod
    // qualifiers: two numeric subtypes (0 and 1), two strains, a variety,
    // a nat_host, a common name duplicated as an OrgMod, and a type_material.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetOrgMod(entry, 0, "foo");
    unit_test_util::SetOrgMod(entry, 1, "bar");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "a");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "b");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_variety, "c");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Sebaea microphylla");
    unit_test_util::SetCommon(entry, "some common name");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_common, "some common name");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_type_material, "invalid type material name");
//    ShowOrgRef(*entry);

    STANDARD_SETUP

    // Expected diagnostics, listed in the order they are registered.
    struct SExpected {
        EDiagSev    severity;
        const char* code;
        const char* message;
    };
    static const SExpected kExpected[] = {
        { eDiag_Warning,  "OrganismNotFound",
          "Organism not found in taxonomy database (suggested:Sebaea microphylla var. c)" },
        { eDiag_Critical, "BadOrgMod",
          "Unknown orgmod subtype 0" },
        { eDiag_Critical, "BadOrgMod",
          "Unknown orgmod subtype 1" },
        { eDiag_Warning,  "MultipleStrains",
          "Multiple strain qualifiers on the same BioSource" },
        { eDiag_Warning,  "BadTypeMaterial",
          "Bad value for type_material" },
        { eDiag_Warning,  "OrgModMissingValue",
          "Variety value specified is not found in taxname" },
        { eDiag_Warning,  "HostIdenticalToOrganism",
          "Specific host is identical to taxname" }
    };
    for (const SExpected& item : kExpected) {
        expected_errors.push_back(
            new CExpectedError("lcl|good", item.severity, item.code, item.message));
    }
    // Disabled expectation, kept for reference:
    /*
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrgMod",
                              "OrgMod common is identical to Org-ref common"));
    */
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5422 
5423 
BOOST_AUTO_TEST_CASE(Test_BadVariety)
{
    // Give the organism a variety qualifier "x", a taxname that includes
    // that variety, and a taxon value of 0.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    SetOrgMod(entry, COrgMod::eSubtype_variety, "x");
    SetTaxname(entry, "Sebaea microphylla var. x");
    SetTaxon(entry, 0);

    STANDARD_SETUP

    // Three warnings are expected, all reported against the same sequence.
    const string kSeqId("lcl|good");

    CExpectedError* bad_variety = new CExpectedError(
        kSeqId, eDiag_Warning, "BadVariety",
        "Orgmod variety should only be in plants, fungi, or cyanobacteria");
    expected_errors.push_back(bad_variety);

    CExpectedError* no_taxon_id = new CExpectedError(
        kSeqId, eDiag_Warning, "NoTaxonID",
        "BioSource is missing taxon ID");
    expected_errors.push_back(no_taxon_id);

    CExpectedError* org_not_found = new CExpectedError(
        kSeqId, eDiag_Warning, "OrganismNotFound",
        "Organism not found in taxonomy database");
    expected_errors.push_back(org_not_found);

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5446 
5447 
BOOST_AUTO_TEST_CASE(Test_Descr_InconsistentProteinTitle)
{
    // Build a nuc-prot set and put an explicit title on the last member of
    // the set (reported as "lcl|prot" below).
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> last_member = entry->SetSet().SetSeq_set().back();

    CRef<CSeqdesc> desc(new CSeqdesc());
    desc->SetTitle("Not the correct title");
    last_member->SetSeq().SetDescr().Set().push_back(desc);

    STANDARD_SETUP

    // The instantiated title should be flagged as not matching the
    // automatically generated one.
    const string kErrMsg(
        "Instantiated protein title does not match automatically generated title");
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "InconsistentProteinTitle", kErrMsg));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5467 
5468 
BOOST_FIXTURE_TEST_CASE(Test_Descr_Inconsistent,CGenBankFixture)5469 BOOST_FIXTURE_TEST_CASE(Test_Descr_Inconsistent, CGenBankFixture)
5470 {
5471     // prepare entry
5472     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
5473     CRef<CSeqdesc> desc1(new CSeqdesc());
5474     desc1->SetMol_type(eGIBB_mol_genomic);
5475     entry->SetSeq().SetDescr().Set().push_back(desc1);
5476     CRef<CSeqdesc> desc2(new CSeqdesc());
5477     desc2->SetMol_type(eGIBB_mol_pre_mRNA);
5478     entry->SetSeq().SetDescr().Set().push_back(desc2);
5479     CRef<CSeqdesc> desc3(new CSeqdesc());
5480     desc3->SetModif().push_back(eGIBB_mod_dna);
5481     desc3->SetModif().push_back(eGIBB_mod_rna);
5482     desc3->SetModif().push_back(eGIBB_mod_mitochondrial);
5483     desc3->SetModif().push_back(eGIBB_mod_cyanelle);
5484     desc3->SetModif().push_back(eGIBB_mod_complete);
5485     desc3->SetModif().push_back(eGIBB_mod_partial);
5486     desc3->SetModif().push_back(eGIBB_mod_no_left);
5487     desc3->SetModif().push_back(eGIBB_mod_no_right);
5488     entry->SetSeq().SetDescr().Set().push_back(desc3);
5489 
5490     CRef<CSeqdesc> desc_gb1(new CSeqdesc());
5491     desc_gb1->SetGenbank().SetKeywords().push_back("TPA:experimental");
5492     desc_gb1->SetGenbank().SetKeywords().push_back("TPA:inferential");
5493     entry->SetSeq().SetDescr().Set().push_back(desc_gb1);
5494     CRef<CSeqdesc> desc_gb2(new CSeqdesc());
5495     desc_gb2->SetGenbank();
5496     entry->SetSeq().SetDescr().Set().push_back(desc_gb2);
5497 
5498     CRef<CSeqdesc> desc_embl1(new CSeqdesc());
5499     desc_embl1->SetEmbl();
5500     entry->SetSeq().SetDescr().Set().push_back(desc_embl1);
5501     CRef<CSeqdesc> desc_embl2(new CSeqdesc());
5502     desc_embl2->SetEmbl();
5503     entry->SetSeq().SetDescr().Set().push_back(desc_embl2);
5504 
5505     CRef<CSeqdesc> desc_pir1(new CSeqdesc());
5506     desc_pir1->SetPir();
5507     entry->SetSeq().SetDescr().Set().push_back(desc_pir1);
5508     CRef<CSeqdesc> desc_pir2(new CSeqdesc());
5509     desc_pir2->SetPir();
5510     entry->SetSeq().SetDescr().Set().push_back(desc_pir2);
5511 
5512     CRef<CSeqdesc> desc_sp1(new CSeqdesc());
5513     desc_sp1->SetSp();
5514     entry->SetSeq().SetDescr().Set().push_back(desc_sp1);
5515     CRef<CSeqdesc> desc_sp2(new CSeqdesc());
5516     desc_sp2->SetSp();
5517     entry->SetSeq().SetDescr().Set().push_back(desc_sp2);
5518 
5519     CRef<CSeqdesc> desc_pdb1(new CSeqdesc());
5520     desc_pdb1->SetPdb();
5521     entry->SetSeq().SetDescr().Set().push_back(desc_pdb1);
5522     CRef<CSeqdesc> desc_pdb2(new CSeqdesc());
5523     desc_pdb2->SetPdb();
5524     entry->SetSeq().SetDescr().Set().push_back(desc_pdb2);
5525 
5526     CRef<CSeqdesc> desc_prf1(new CSeqdesc());
5527     desc_prf1->SetPrf();
5528     entry->SetSeq().SetDescr().Set().push_back(desc_prf1);
5529     CRef<CSeqdesc> desc_prf2(new CSeqdesc());
5530     desc_prf2->SetPrf();
5531     entry->SetSeq().SetDescr().Set().push_back(desc_prf2);
5532 
5533     CRef<CSeqdesc> desc_create1(new CSeqdesc());
5534     desc_create1->SetCreate_date().SetStd().SetYear(2009);
5535     desc_create1->SetCreate_date().SetStd().SetMonth(4);
5536     entry->SetSeq().SetDescr().Set().push_back(desc_create1);
5537     CRef<CSeqdesc> desc_create2(new CSeqdesc());
5538     desc_create2->SetCreate_date().SetStd().SetYear(2009);
5539     desc_create2->SetCreate_date().SetStd().SetMonth(3);
5540     entry->SetSeq().SetDescr().Set().push_back(desc_create2);
5541     CRef<CSeqdesc> desc_update(new CSeqdesc());
5542     desc_update->SetUpdate_date().SetStd().SetYear(2009);
5543     desc_update->SetUpdate_date().SetStd().SetMonth(2);
5544     entry->SetSeq().SetDescr().Set().push_back(desc_update);
5545 
5546     CRef<CSeqdesc> src_desc(new CSeqdesc());
5547     src_desc->SetSource().SetOrg().SetTaxname("Trichechus manatus");
5548     unit_test_util::SetTaxon (src_desc->SetSource(), 9778);
5549     src_desc->SetSource().SetOrg().SetOrgname().SetLineage("some lineage");
5550     entry->SetSeq().SetDescr().Set().push_back(src_desc);
5551 
5552     SetTech(entry, CMolInfo::eTech_genemap);
5553     unit_test_util::SetCompleteness (entry, CMolInfo::eCompleteness_no_left);
5554     CRef<CSeqdesc> m_desc(new CSeqdesc());
5555     m_desc->SetMolinfo().SetBiomol(CMolInfo::eBiomol_cRNA);
5556     m_desc->SetMolinfo().SetTech(CMolInfo::eTech_fli_cdna);
5557     m_desc->SetMolinfo().SetCompleteness(CMolInfo::eCompleteness_no_right);
5558     entry->SetSeq().SetDescr().Set().push_back(m_desc);
5559 
5560     STANDARD_SETUP
5561 
5562     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentTPA",
5563                               "TPA:experimental and TPA:inferential should not both be in the same set of keywords"));
5564     /*
5565     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentDates",
5566                               "Inconsistent create_dates [Mar 2009] and [Apr 2009]"));
5567     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentDates",
5568                               "Inconsistent create_date [Apr 2009] and update_date [Feb 2009]"));
5569     */
5570     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentTaxName",
5571                               "Inconsistent organism names [Trichechus manatus] and [Sebaea microphylla]"));
5572     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolInfo",
5573                               "Inconsistent Molinfo-biomol [1] and [11]"));
5574     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolInfoTechnique",
5575                               "Inconsistent Molinfo-tech [5] and [17]"));
5576     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentMolInfo",
5577                               "Inconsistent Molinfo-completeness [3] and [4]"));
5578     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InconsistentGenBankblocks",
5579                               "Multiple GenBank blocks"));
5580     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5581                               "Multiple EMBL blocks"));
5582     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5583                               "Multiple PIR blocks"));
5584     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5585                               "Multiple PDB blocks"));
5586     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5587                               "Multiple PRF blocks"));
5588     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5589                               "Multiple SWISS-PROT blocks"));
5590     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5591                               "Inconsistent GIBB-mod [0] and [1]"));
5592     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5593                               "Inconsistent GIBB-mod [4] and [7]"));
5594     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5595                               "Inconsistent GIBB-mod [11] and [10]"));
5596     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5597                               "Inconsistent GIBB-mod [11] and [16]"));
5598     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5599                               "Inconsistent GIBB-mod [11] and [17]"));
5600     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "Inconsistent",
5601                               "Inconsistent GIBB-mol [1] and [2]"));
5602     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
5603                               "MolType descriptor is obsolete"));
5604     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
5605                               "MolType descriptor is obsolete"));
5606     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
5607                               "Modif descriptor is obsolete"));
5608     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
5609                               "Create date has error - BAD_DAY"));
5610     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
5611                               "Create date has error - BAD_DAY"));
5612     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
5613                               "Update date has error - BAD_DAY"));
5614     /*
5615     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MultipleBioSources",
5616                               "Undesired multiple source descriptors"));
5617     */
5618     //AddChromosomeNoLocation(expected_errors, entry);
5619 
5620     eval = validator.Validate(seh, options);
5621     CheckErrors (*eval, expected_errors);
5622 
5623     CLEAR_ERRORS
5624 
5625     // try different WGS-style accessions, check for wgs_tech
5626     scope.RemoveTopLevelSeqEntry(seh);
5627     entry = unit_test_util::BuildGoodSeq();
5628     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("ABCD12345678");
5629     seh = scope.AddTopLevelSeqEntry(*entry);
5630 
5631     expected_errors.push_back(new CExpectedError("gb|ABCD12345678|", eDiag_Error, "InconsistentMolInfoTechnique",
5632                               "WGS accession should have Mol-info.tech of wgs"));
5633     //AddChromosomeNoLocation(expected_errors, entry);
5634     eval = validator.Validate(seh, options);
5635     CheckErrors (*eval, expected_errors);
5636     scope.RemoveTopLevelSeqEntry(seh);
5637     entry->SetSeq().SetId().front()->SetEmbl().SetAccession("ABCE12345678");
5638     ChangeErrorAcc(expected_errors, "emb|ABCE12345678|");
5639     seh = scope.AddTopLevelSeqEntry(*entry);
5640     eval = validator.Validate(seh, options);
5641     CheckErrors (*eval, expected_errors);
5642     scope.RemoveTopLevelSeqEntry(seh);
5643     entry->SetSeq().SetId().front()->SetDdbj().SetAccession("ABCF12345678");
5644     ChangeErrorAcc(expected_errors, "dbj|ABCF12345678|");
5645     seh = scope.AddTopLevelSeqEntry(*entry);
5646     eval = validator.Validate(seh, options);
5647     CheckErrors (*eval, expected_errors);
5648 
5649     CLEAR_ERRORS
5650 
5651     // look for correct accession if WGS tech present
5652     scope.RemoveTopLevelSeqEntry(seh);
5653     entry->SetSeq().SetId().front()->SetEmbl().SetAccession("AA123456");
5654     //AddChromosomeNoLocation(expected_errors, entry);
5655     seh = scope.AddTopLevelSeqEntry(*entry);
5656     eval = validator.Validate(seh, options);
5657     CheckErrors (*eval, expected_errors);
5658 
5659     CLEAR_ERRORS
5660 
5661     scope.RemoveTopLevelSeqEntry(seh);
5662     entry->SetSeq().SetId().front()->SetDdbj().SetAccession("AB123456");
5663     //AddChromosomeNoLocation(expected_errors, entry);
5664     seh = scope.AddTopLevelSeqEntry(*entry);
5665     eval = validator.Validate(seh, options);
5666     CheckErrors (*eval, expected_errors);
5667 
5668     CLEAR_ERRORS
5669 
5670     scope.RemoveTopLevelSeqEntry(seh);
5671     entry = unit_test_util::BuildGoodSeq();
5672     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AC123456");
5673 
5674     SetTech(entry, CMolInfo::eTech_wgs);
5675     seh = scope.AddTopLevelSeqEntry(*entry);
5676     expected_errors.push_back(new CExpectedError("gb|AC123456|", eDiag_Error, "InconsistentWGSFlags",
5677                               "Mol-info.tech of wgs should have WGS accession"));
5678     expected_errors.push_back(new CExpectedError("gb|AC123456|", eDiag_Warning, "UnexpectedIdentifierChange",
5679         "Loss of general ID (BCMHGSC: PROJECT_GXOU.BAYLOR) on gi (25008031) compared to the NCBI sequence repository"));
5680     AddChromosomeNoLocation(expected_errors, entry);
5681     eval = validator.Validate(seh, options);
5682     CheckErrors (*eval, expected_errors);
5683 
5684     CLEAR_ERRORS
5685 
5686     scope.RemoveTopLevelSeqEntry(seh);
5687     entry->SetSeq().SetId().front()->SetOther().SetAccession("NM_123456");
5688     seh = scope.AddTopLevelSeqEntry(*entry);
5689     expected_errors.push_back(new CExpectedError("ref|NM_123456|", eDiag_Error, "InconsistentWGSFlags",
5690         "Mol-info.tech of wgs should have WGS accession"));
5691     AddChromosomeNoLocation(expected_errors, entry);
5692 
5693     eval = validator.Validate(seh, options);
5694     CheckErrors (*eval, expected_errors);
5695 
5696     scope.RemoveTopLevelSeqEntry(seh);
5697     entry->SetSeq().SetId().front()->SetOther().SetAccession("NP_123456");
5698     seh = scope.AddTopLevelSeqEntry(*entry);
5699     ChangeErrorAcc(expected_errors, "ref|NP_123456|");
5700     eval = validator.Validate(seh, options);
5701     CheckErrors (*eval, expected_errors);
5702 
5703     scope.RemoveTopLevelSeqEntry(seh);
5704     entry->SetSeq().SetId().front()->SetOther().SetAccession("NG_123456");
5705     seh = scope.AddTopLevelSeqEntry(*entry);
5706     ChangeErrorAcc(expected_errors, "ref|NG_123456|");
5707     eval = validator.Validate(seh, options);
5708     CheckErrors (*eval, expected_errors);
5709 
5710     scope.RemoveTopLevelSeqEntry(seh);
5711     entry->SetSeq().SetId().front()->SetOther().SetAccession("NR_123456");
5712     seh = scope.AddTopLevelSeqEntry(*entry);
5713     ChangeErrorAcc(expected_errors, "ref|NR_123456|");
5714     eval = validator.Validate(seh, options);
5715     CheckErrors (*eval, expected_errors);
5716 
5717     CLEAR_ERRORS
5718 
5719     // no tech warning if other but not one of four starts
5720     scope.RemoveTopLevelSeqEntry(seh);
5721     entry->SetSeq().SetId().front()->SetOther().SetAccession("NX_123456");
5722     seh = scope.AddTopLevelSeqEntry(*entry);
5723     eval = validator.Validate(seh, options);
5724     AddChromosomeNoLocation(expected_errors, entry);
5725     CheckErrors (*eval, expected_errors);
5726 
5727     CLEAR_ERRORS
5728 
5729     // skip warning if segset accession
5730     vector<string> segset_accession_prefixes;
5731     segset_accession_prefixes.push_back("AH");
5732     segset_accession_prefixes.push_back("CH");
5733     segset_accession_prefixes.push_back("CM");
5734     segset_accession_prefixes.push_back("DS");
5735     segset_accession_prefixes.push_back("EM");
5736     segset_accession_prefixes.push_back("EN");
5737     segset_accession_prefixes.push_back("EP");
5738     segset_accession_prefixes.push_back("EQ");
5739     segset_accession_prefixes.push_back("FA");
5740     segset_accession_prefixes.push_back("GG");
5741     segset_accession_prefixes.push_back("GL");
5742 
5743     for (vector<string>::iterator it = segset_accession_prefixes.begin();
5744          it != segset_accession_prefixes.end();
5745          ++it) {
5746         scope.RemoveTopLevelSeqEntry(seh);
5747         entry->SetSeq().SetId().front()->SetOther().SetAccession(*it + "_123456");
5748         seh = scope.AddTopLevelSeqEntry(*entry);
5749         eval = validator.Validate(seh, options);
5750         AddChromosomeNoLocation(expected_errors, entry);
5751         CheckErrors (*eval, expected_errors);
5752         CLEAR_ERRORS
5753     }
5754 
5755     // biomol on NC should be genomic or cRNA
5756     scope.RemoveTopLevelSeqEntry(seh);
5757     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
5758     SetTech(entry, CMolInfo::eTech_unknown);
5759     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic);
5760     seh = scope.AddTopLevelSeqEntry(*entry);
5761     // no error expected
5762     eval = validator.Validate(seh, options);
5763     //AddChromosomeNoLocation(expected_errors, entry);
5764     CheckErrors (*eval, expected_errors);
5765     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_cRNA);
5766     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
5767     // no error expected
5768     eval = validator.Validate(seh, options);
5769     CheckErrors (*eval, expected_errors);
5770     // expect errors
5771     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic_mRNA);
5772     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "InconsistentRefSeqMoltype",
5773                               "genomic RefSeq accession should use genomic or cRNA moltype"));
5774     eval = validator.Validate(seh, options);
5775     CheckErrors (*eval, expected_errors);
5776     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
5777     eval = validator.Validate(seh, options);
5778     CheckErrors (*eval, expected_errors);
5779     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_ncRNA);
5780     eval = validator.Validate(seh, options);
5781     CheckErrors (*eval, expected_errors);
5782     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_pre_RNA);
5783     eval = validator.Validate(seh, options);
5784     CheckErrors (*eval, expected_errors);
5785     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_rRNA);
5786     eval = validator.Validate(seh, options);
5787     CheckErrors (*eval, expected_errors);
5788     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_rRNA);
5789     eval = validator.Validate(seh, options);
5790     CheckErrors (*eval, expected_errors);
5791     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_scRNA);
5792     eval = validator.Validate(seh, options);
5793     CheckErrors (*eval, expected_errors);
5794     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_snoRNA);
5795     eval = validator.Validate(seh, options);
5796     CheckErrors (*eval, expected_errors);
5797     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_snRNA);
5798     eval = validator.Validate(seh, options);
5799     CheckErrors (*eval, expected_errors);
5800     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_tmRNA);
5801     eval = validator.Validate(seh, options);
5802     CheckErrors (*eval, expected_errors);
5803     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_transcribed_RNA);
5804     eval = validator.Validate(seh, options);
5805     CheckErrors (*eval, expected_errors);
5806     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_tRNA);
5807     eval = validator.Validate(seh, options);
5808     CheckErrors (*eval, expected_errors);
5809 
5810     CLEAR_ERRORS
5811 }
5812 
5813 
BOOST_AUTO_TEST_CASE(Test_Descr_ObsoleteSourceLocation)
{
    // First case: the BioSource genome (location) is set to transposon.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetGenome(entry, CBioSource::eGenome_transposon);

    STANDARD_SETUP

    // The same warning is expected for both obsolete location values.
    const string kErrMsg(
        "Transposon and insertion sequence are no longer legal locations");
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "ObsoleteSourceLocation", kErrMsg));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Second case: switch the location to insertion_seq and validate again.
    unit_test_util::SetGenome(entry, CBioSource::eGenome_insertion_seq);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5834 
5835 
BOOST_AUTO_TEST_CASE(Test_Descr_ObsoleteSourceQual)
{
    // Attach one transposon_name and one insertion_seq_name qualifier.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_transposon_name, "a");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_insertion_seq_name, "b");

    STANDARD_SETUP

    // Two identical warnings are expected, matching the two qualifiers
    // added above.
    const string kErrMsg(
        "Transposon name and insertion sequence name are no longer legal qualifiers");
    for (int i = 0; i < 2; ++i) {
        expected_errors.push_back(
            new CExpectedError("lcl|good", eDiag_Warning, "ObsoleteSourceQual", kErrMsg));
    }
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5856 
5857 
BOOST_AUTO_TEST_CASE(Test_Descr_StructuredSourceNote)5858 BOOST_AUTO_TEST_CASE(Test_Descr_StructuredSourceNote)
5859 {
5860     // prepare entry
5861     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
5862 
5863     STANDARD_SETUP
5864 
5865     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StructuredSourceNote",
5866                               "Source note has structured tag '"));
5867     //AddChromosomeNoLocation(expected_errors, entry);
5868 
5869     vector<string> tag_prefixes;
5870     tag_prefixes.push_back("acronym:");
5871     tag_prefixes.push_back("anamorph:");
5872     tag_prefixes.push_back("authority:");
5873     tag_prefixes.push_back("biotype:");
5874     tag_prefixes.push_back("biovar:");
5875     tag_prefixes.push_back("bio_material:");
5876     tag_prefixes.push_back("breed:");
5877     tag_prefixes.push_back("cell_line:");
5878     tag_prefixes.push_back("cell_type:");
5879     tag_prefixes.push_back("chemovar:");
5880     tag_prefixes.push_back("chromosome:");
5881     tag_prefixes.push_back("clone:");
5882     tag_prefixes.push_back("clone_lib:");
5883     tag_prefixes.push_back("collected_by:");
5884     tag_prefixes.push_back("collection_date:");
5885     tag_prefixes.push_back("common:");
5886     tag_prefixes.push_back("country:");
5887     tag_prefixes.push_back("cultivar:");
5888     tag_prefixes.push_back("culture_collection:");
5889     tag_prefixes.push_back("dev_stage:");
5890     tag_prefixes.push_back("dosage:");
5891     tag_prefixes.push_back("ecotype:");
5892     tag_prefixes.push_back("endogenous_virus_name:");
5893     tag_prefixes.push_back("environmental_sample:");
5894     tag_prefixes.push_back("forma:");
5895     tag_prefixes.push_back("forma_specialis:");
5896     tag_prefixes.push_back("frequency:");
5897     tag_prefixes.push_back("fwd_pcr_primer_name");
5898     tag_prefixes.push_back("fwd_pcr_primer_seq");
5899     tag_prefixes.push_back("fwd_primer_name");
5900     tag_prefixes.push_back("fwd_primer_seq");
5901     tag_prefixes.push_back("genotype:");
5902     tag_prefixes.push_back("germline:");
5903     tag_prefixes.push_back("group:");
5904     tag_prefixes.push_back("haplogroup:");
5905     tag_prefixes.push_back("haplotype:");
5906     tag_prefixes.push_back("identified_by:");
5907     tag_prefixes.push_back("insertion_seq_name:");
5908     tag_prefixes.push_back("isolate:");
5909     tag_prefixes.push_back("isolation_source:");
5910     tag_prefixes.push_back("lab_host:");
5911     tag_prefixes.push_back("lat_lon:");
5912     tag_prefixes.push_back("left_primer:");
5913     tag_prefixes.push_back("linkage_group:");
5914     tag_prefixes.push_back("map:");
5915     tag_prefixes.push_back("mating_type:");
5916     tag_prefixes.push_back("metagenome_source:");
5917     tag_prefixes.push_back("metagenomic:");
5918     tag_prefixes.push_back("nat_host:");
5919     tag_prefixes.push_back("pathovar:");
5920     tag_prefixes.push_back("placement:");
5921     tag_prefixes.push_back("plasmid_name:");
5922     tag_prefixes.push_back("plastid_name:");
5923     tag_prefixes.push_back("pop_variant:");
5924     tag_prefixes.push_back("rearranged:");
5925     tag_prefixes.push_back("rev_pcr_primer_name");
5926     tag_prefixes.push_back("rev_pcr_primer_seq");
5927     tag_prefixes.push_back("rev_primer_name");
5928     tag_prefixes.push_back("rev_primer_seq");
5929     tag_prefixes.push_back("right_primer:");
5930     tag_prefixes.push_back("segment:");
5931     tag_prefixes.push_back("serogroup:");
5932     tag_prefixes.push_back("serotype:");
5933     tag_prefixes.push_back("serovar:");
5934     tag_prefixes.push_back("sex:");
5935     tag_prefixes.push_back("specimen_voucher:");
5936     tag_prefixes.push_back("strain:");
5937     tag_prefixes.push_back("subclone:");
5938     tag_prefixes.push_back("subgroup:");
5939     tag_prefixes.push_back("substrain:");
5940     tag_prefixes.push_back("subtype:");
5941     tag_prefixes.push_back("sub_species:");
5942     tag_prefixes.push_back("synonym:");
5943     tag_prefixes.push_back("taxon:");
5944     tag_prefixes.push_back("teleomorph:");
5945     tag_prefixes.push_back("tissue_lib:");
5946     tag_prefixes.push_back("tissue_type:");
5947     tag_prefixes.push_back("transgenic:");
5948     tag_prefixes.push_back("transposon_name:");
5949     tag_prefixes.push_back("type:");
5950     tag_prefixes.push_back("variety:");
5951 
5952     for (vector<string>::iterator it = tag_prefixes.begin();
5953          it != tag_prefixes.end();
5954          ++it) {
5955         expected_errors[0]->SetErrMsg("Source note has structured tag '" + *it + "'");
5956         unit_test_util::SetSubSource(entry, CSubSource::eSubtype_other, *it + "a");
5957         eval = validator.Validate(seh, options);
5958         CheckErrors (*eval, expected_errors);
5959         unit_test_util::SetSubSource(entry, CSubSource::eSubtype_other, "");
5960         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_other, *it + "a");
5961         eval = validator.Validate(seh, options);
5962         CheckErrors (*eval, expected_errors);
5963         unit_test_util::SetOrgMod(entry, CSubSource::eSubtype_other, "");
5964     }
5965 
5966 
5967     CLEAR_ERRORS
5968 }
5969 
5970 
BOOST_AUTO_TEST_CASE(Test_Descr_UnnecessaryBioSourceFocus)
{
    // Set focus on the BioSource descriptor of a sequence to which this test
    // adds no BioSource feature.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetFocus(entry);

    STANDARD_SETUP

    // One error is expected for the focus flag.
    const string kErrCode("UnnecessaryBioSourceFocus");
    const string kErrMsg("BioSource descriptor has focus, but no BioSource feature");
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, kErrCode, kErrMsg));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
5988 
5989 
BOOST_AUTO_TEST_CASE(Test_Descr_RefGeneTrackingWithoutStatus)
{
    // Give the sequence a RefSeq-style id.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");

    // Build a RefGeneTracking user object whose only field is an arbitrary
    // label/data pair; this test adds no other field to it.
    CRef<CUser_field> field(new CUser_field());
    field->SetLabel().SetStr("Label");
    field->SetData().SetStr("Data");

    CRef<CSeqdesc> desc(new CSeqdesc());
    CUser_object& tracking = desc->SetUser();
    tracking.SetObjectType(CUser_object::eObjectType_RefGeneTracking);
    tracking.SetData().push_back(field);
    entry->SetSeq().SetDescr().Set().push_back(desc);

    STANDARD_SETUP

    // The validator should report that Status is not set.
    const string kSeqId("ref|NC_123456|");
    expected_errors.push_back(
        new CExpectedError(kSeqId, eDiag_Error, "RefGeneTrackingWithoutStatus",
                           "RefGeneTracking object needs to have Status set"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
6015 
6016 
BOOST_AUTO_TEST_CASE(Test_Descr_UnwantedCompleteFlag)
{
    // GenBank-accessioned sequence marked complete, with a title that
    // contains neither "complete sequence" nor "complete genome".
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
    unit_test_util::SetCompleteness(entry, CMolInfo::eCompleteness_complete);
    SetTitle(entry, "a title without the word");

    STANDARD_SETUP

    // Runs the validator on the current handle and compares the result with
    // the current contents of expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    CExpectedError* complete_warning = new CExpectedError(
        "gb|AY123456|", eDiag_Warning, "UnwantedCompleteFlag",
        "Suspicious use of complete");
    expected_errors.push_back(complete_warning);
    validate_and_check();

    // With tech HTGS3 the same message is expected at warning severity
    // (the expectation already is a warning, so this re-assigns the same value).
    SetTech(entry, CMolInfo::eTech_htgs_3);
    expected_errors[0]->SetSeverity(eDiag_Warning);
    validate_and_check();

    CLEAR_ERRORS

    // Title containing "complete sequence": nothing is expected.
    SetTitle(entry, "complete sequence");
    validate_and_check();

    CLEAR_ERRORS

    // Viral lineage: nothing is expected. The Seq-id is changed as well, and
    // the entry is taken out of the scope while it is edited, then re-added.
    scope.RemoveTopLevelSeqEntry(seh);
    SetTitle(entry, "a title without the word");
    entry->SetSeq().SetId().front()->SetEmbl().SetAccession("AY123457");
    unit_test_util::SetLineage(entry, "Viruses");
    seh = scope.AddTopLevelSeqEntry(*entry);
    validate_and_check();

    // Artificial origin: nothing is expected.
    // NOTE(review): the validator is not re-run after these two edits, so
    // this CheckErrors call still inspects the result of the viral run
    // above -- confirm whether a Validate() call was intended here.
    unit_test_util::SetLineage(entry, "Bacteria");
    unit_test_util::SetOrigin(entry, CBioSource::eOrigin_artificial);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
6067 
6068 
BOOST_AUTO_TEST_CASE(Test_Descr_CollidingPublications)6069 BOOST_AUTO_TEST_CASE(Test_Descr_CollidingPublications)
6070 {
6071     // prepare entry
6072     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
6073     CRef<CSeqdesc> pub1 = unit_test_util::BuildGoodPubSeqdesc();
6074     CRef<CAuthor> auth1 = unit_test_util::BuildGoodAuthor();
6075     CRef<CPub> otherpub1(new CPub());
6076     otherpub1->SetArticle().SetAuthors().SetNames().SetStd().push_back(auth1);
6077     CRef<CCit_art::TTitle::C_E> title1(new CCit_art::TTitle::C_E());
6078     title1->SetName("First title");
6079     otherpub1->SetArticle().SetTitle().Set().push_back(title1);
6080     pub1->SetPub().SetPub().Set().push_back(otherpub1);
6081     entry->SetSeq().SetDescr().Set().push_back(pub1);
6082     CRef<CSeqdesc> pub2 = unit_test_util::BuildGoodPubSeqdesc();
6083     CRef<CPub> otherpub2(new CPub());
6084     CRef<CAuthor> auth2 = unit_test_util::BuildGoodAuthor();
6085     otherpub2->SetArticle().SetAuthors().SetNames().SetStd().push_back(auth1);
6086     CRef<CCit_art::TTitle::C_E> title2(new CCit_art::TTitle::C_E());
6087     title2->SetName("Second title");
6088     otherpub2->SetArticle().SetTitle().Set().push_back(title2);
6089     pub2->SetPub().SetPub().Set().push_back(otherpub2);
6090     entry->SetSeq().SetDescr().Set().push_back(pub2);
6091 
6092     STANDARD_SETUP
6093 
6094     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CollidingPubMedID",
6095                               "Multiple publications with identical PubMed ID"));
6096     //AddChromosomeNoLocation(expected_errors, entry);
6097 
6098     eval = validator.Validate(seh, options);
6099     CheckErrors (*eval, expected_errors);
6100 
6101     // should also report muid collisions
6102     pub1->SetPub().SetPub().Set().front()->SetMuid(ENTREZ_ID_CONST(2));
6103     pub2->SetPub().SetPub().Set().front()->SetMuid(ENTREZ_ID_CONST(2));
6104     eval = validator.Validate(seh, options);
6105     CheckErrors (*eval, expected_errors);
6106 
6107     // look for same pub twice
6108     title2->SetName("First title");
6109     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CollidingPublications",
6110         "Multiple equivalent publications annotated on this sequence [Last|Ft; Last]"));
6111     eval = validator.Validate(seh, options);
6112     CheckErrors (*eval, expected_errors);
6113 
6114     delete expected_errors[1];
6115     expected_errors.pop_back();
6116 
6117     // look for multiple IDs on same pub
6118     scope.RemoveTopLevelSeqEntry(seh);
6119     entry->SetSeq().SetDescr().Set().pop_back();
6120     CRef<CPub> extra_id(new CPub());
6121     extra_id->SetMuid(ENTREZ_ID_CONST(3));
6122     pub1->SetPub().SetPub().Set().push_back(extra_id);
6123     seh = scope.AddTopLevelSeqEntry(*entry);
6124     expected_errors[0]->SetErrCode("CollidingPublications");
6125     expected_errors[0]->SetErrMsg("Multiple conflicting muids in a single publication");
6126     eval = validator.Validate(seh, options);
6127     CheckErrors (*eval, expected_errors);
6128     extra_id->SetMuid(ENTREZ_ID_CONST(2));
6129     expected_errors[0]->SetErrMsg("Multiple redundant muids in a single publication");
6130     eval = validator.Validate(seh, options);
6131     CheckErrors (*eval, expected_errors);
6132     pub1->SetPub().SetPub().Set().front()->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
6133     extra_id->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(3)));
6134     expected_errors[0]->SetErrMsg("Multiple conflicting pmids in a single publication");
6135     eval = validator.Validate(seh, options);
6136     CheckErrors (*eval, expected_errors);
6137     extra_id->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
6138     expected_errors[0]->SetErrMsg("Multiple redundant pmids in a single publication");
6139     eval = validator.Validate(seh, options);
6140     CheckErrors (*eval, expected_errors);
6141 
6142     CLEAR_ERRORS
6143 }
6144 
6145 
BOOST_AUTO_TEST_CASE(Test_Descr_TransgenicProblem)
{
    // Good sequence whose source gets a transgenic subsource; at this point
    // the test has not added a source feature.
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_transgenic, "true");

    STANDARD_SETUP

    // Runs the validator on the current handle and compares the result with
    // the current contents of expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    CExpectedError* transgenic_warning = new CExpectedError(
        "lcl|good", eDiag_Warning, "TransgenicProblem",
        "Transgenic source descriptor requires presence of source feature");
    expected_errors.push_back(transgenic_warning);
    validate_and_check();

    CLEAR_ERRORS

    // After AddGoodSourceFeature() nothing is expected any more. The entry
    // is removed from the scope while it is edited and re-added afterwards.
    scope.RemoveTopLevelSeqEntry(seh);
    AddGoodSourceFeature(entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    validate_and_check();

    CLEAR_ERRORS
}
6174 
6175 
BOOST_AUTO_TEST_CASE(Test_Descr_TaxonomyLookupProblem)6176 BOOST_AUTO_TEST_CASE(Test_Descr_TaxonomyLookupProblem)
6177 {
6178     // prepare entry
6179     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
6180     unit_test_util::SetTaxname(entry, "Not valid");
6181     unit_test_util::SetTaxon(entry, 0);
6182 
6183     STANDARD_SETUP
6184 
6185     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6186         "BioSource is missing taxon ID"));
6187     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
6188                               "Organism not found in taxonomy database"));
6189     //AddChromosomeNoLocation(expected_errors, entry);
6190 
6191     eval = validator.Validate(seh, options);
6192     CheckErrors (*eval, expected_errors);
6193 
6194     CLEAR_ERRORS
6195 
6196     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6197         "BioSource is missing taxon ID"));
6198     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyIsSpeciesProblem",
6199                               "Taxonomy lookup reports is_species_level FALSE"));
6200     //AddChromosomeNoLocation(expected_errors, entry);
6201     unit_test_util::SetTaxname(entry, "Poeciliinae");
6202     eval = validator.Validate(seh, options);
6203     CheckErrors (*eval, expected_errors);
6204 
6205     CLEAR_ERRORS
6206 
6207     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6208         "BioSource is missing taxon ID"));
6209     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyConsultRequired",
6210                               "Taxonomy lookup reports taxonomy consultation needed"));
6211     //AddChromosomeNoLocation(expected_errors, entry);
6212     unit_test_util::SetTaxname(entry, "Anabaena circinalis");
6213     eval = validator.Validate(seh, options);
6214     CheckErrors (*eval, expected_errors);
6215 
6216     CLEAR_ERRORS
6217 
6218     unit_test_util::SetTaxname(entry, "Homo sapiens");
6219     unit_test_util::SetGenome(entry, CBioSource::eGenome_nucleomorph);
6220     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrganelleLocation",
6221                               "Only Chlorarachniophyceae and Cryptophyta have nucleomorphs"));
6222     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6223         "BioSource is missing taxon ID"));
6224     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyNucleomorphProblem",
6225                               "Taxonomy lookup does not have expected nucleomorph flag"));
6226     //AddChromosomeNoLocation(expected_errors, entry);
6227     eval = validator.Validate(seh, options);
6228     CheckErrors (*eval, expected_errors);
6229 
6230 
6231     CLEAR_ERRORS
6232 }
6233 
6234 
TestConsultRequired(const string & taxname)6235 void TestConsultRequired(const string& taxname)
6236 {
6237     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
6238     unit_test_util::SetTaxname(entry, taxname);
6239     unit_test_util::SetTaxon(entry, 0);
6240 
6241     STANDARD_SETUP
6242 
6243     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
6244             "BioSource is missing taxon ID"));
6245     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "TaxonomyConsultRequired",
6246         "Taxonomy lookup reports taxonomy consultation needed"));
6247     //AddChromosomeNoLocation(expected_errors, entry);
6248 
6249     eval = validator.Validate(seh, options);
6250     CheckErrors(*eval, expected_errors);
6251 
6252     CLEAR_ERRORS
6253 
6254 }
6255 
6256 
BOOST_AUTO_TEST_CASE(Test_VR_857)
{
    // Taxname handed to the TestConsultRequired helper.
    // "Erythrobacter marisflavi" is a second candidate that is currently
    // not exercised.
    const string taxname("Colletotrichum cliviae");
    TestConsultRequired(taxname);
}
6263 
6264 
BOOST_AUTO_TEST_CASE(Test_Descr_MultipleTitles)
{
    // Good sequence given one title through SetTitle() and a second title
    // through an extra descriptor pushed onto the descriptor list.
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    SetTitle(entry, "First title");

    CRef<CSeqdesc> second_title(new CSeqdesc());
    second_title->SetTitle("Second title");
    entry->SetSeq().SetDescr().Set().push_back(second_title);

    STANDARD_SETUP

    CExpectedError* multiple_titles = new CExpectedError(
        "lcl|good", eDiag_Error, "MultipleTitles",
        "Undesired multiple title descriptors");
    expected_errors.push_back(multiple_titles);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
6286 
6287 
BOOST_AUTO_TEST_CASE(Test_Descr_RefGeneTrackingOnNonRefSeq)
{
    // GenBank-class set holding two good sequences. The first one receives
    // the RefGeneTracking user object; the second one is renamed to "good2".
    CRef<CSeq_entry> entry(new CSeq_entry());
    CBioseq_set& gb_set = entry->SetSet();
    gb_set.SetClass(CBioseq_set::eClass_genbank);

    CRef<CSeq_entry> firstseq(unit_test_util::BuildGoodSeq());
    AddRefGeneTrackingUserObject(firstseq);
    gb_set.SetSeq_set().push_back(firstseq);

    CRef<CSeq_entry> secondseq(unit_test_util::BuildGoodSeq());
    secondseq->SetSeq().SetId().front()->SetLocal().SetStr("good2");
    gb_set.SetSeq_set().push_back(secondseq);

    STANDARD_SETUP

    // Runs the validator on the current handle and compares the result with
    // the current contents of expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    CExpectedError* non_refseq = new CExpectedError(
        "lcl|good", eDiag_Error, "RefGeneTrackingOnNonRefSeq",
        "RefGeneTracking object should only be in RefSeq record");
    expected_errors.push_back(non_refseq);
    validate_and_check();

    CLEAR_ERRORS

    // When the second sequence carries an "other" (NC_) accession, nothing
    // is expected. The id changes, so the entry is removed from the scope
    // for the edit and re-added afterwards.
    scope.RemoveTopLevelSeqEntry(seh);
    secondseq->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
    seh = scope.AddTopLevelSeqEntry(*entry);
    validate_and_check();

    CLEAR_ERRORS
}
6323 
6324 
BOOST_AUTO_TEST_CASE(Test_OrgModMissingValue)6325 BOOST_AUTO_TEST_CASE(Test_OrgModMissingValue)
6326 {
6327     // prepare entry
6328     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
6329     unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
6330     unit_test_util::SetTaxon(entry, 0);
6331     unit_test_util::SetTaxon(entry, 3702);
6332     unit_test_util::SetLineage(entry, "Cyanobacteria");
6333     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_variety, "foo");
6334 
6335     STANDARD_SETUP
6336 
6337         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrgModMissingValue",
6338             "Variety value specified is not found in taxname"));
6339     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
6340         "Organism not found in taxonomy database (suggested:Arabidopsis thaliana var. foo)"));
6341     //AddChromosomeNoLocation(expected_errors, entry);
6342 
6343     eval = validator.Validate(seh, options);
6344     CheckErrors(*eval, expected_errors);
6345 
6346     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_variety, "");
6347     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_forma, "foo");
6348     expected_errors[0]->SetErrMsg("Forma value specified is not found in taxname");
6349     expected_errors[1]->SetErrMsg("Organism not found in taxonomy database (suggested:Arabidopsis thaliana f. foo)");
6350     eval = validator.Validate(seh, options);
6351     CheckErrors(*eval, expected_errors);
6352 
6353     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_forma, "");
6354     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_sub_species, "foo");
6355     expected_errors[0]->SetErrMsg("Subspecies value specified is not found in taxname");
6356     expected_errors[1]->SetErrMsg("Organism not found in taxonomy database (suggested:Arabidopsis thaliana subsp. foo)");
6357     eval = validator.Validate(seh, options);
6358     CheckErrors(*eval, expected_errors);
6359 
6360     CLEAR_ERRORS
6361         // this one does not cause taxname lookup to fail
6362         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_sub_species, "");
6363     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_forma_specialis, "foo");
6364     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6365         "OrgModMissingValue",
6366         "Forma specialis value specified is not found in taxname"));
6367     //AddChromosomeNoLocation(expected_errors, entry);
6368     eval = validator.Validate(seh, options);
6369     CheckErrors(*eval, expected_errors);
6370 
6371     CLEAR_ERRORS
6372 
6373     // some don't produce errors
6374     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_forma_specialis, "");
6375     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_biovar, "foo");
6376     //AddChromosomeNoLocation(expected_errors, entry);
6377     eval = validator.Validate(seh, options);
6378     CheckErrors(*eval, expected_errors);
6379 
6380     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_biovar, "");
6381     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_pathovar, "foo");
6382     eval = validator.Validate(seh, options);
6383     CheckErrors(*eval, expected_errors);
6384 
6385 }
6386 
6387 
BOOST_AUTO_TEST_CASE(Test_BadTextInSourceQualifier)
{
    // Five subsource qualifiers are each given the text value "a" on an
    // Arabidopsis thaliana (taxon 3702) source; a good source feature is
    // added as well.
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 3702);
    for (auto flag_subtype : { CSubSource::eSubtype_germline,
                               CSubSource::eSubtype_rearranged,
                               CSubSource::eSubtype_transgenic,
                               CSubSource::eSubtype_environmental_sample,
                               CSubSource::eSubtype_metagenomic }) {
        unit_test_util::SetSubSource(entry, flag_subtype, "a");
    }
    AddGoodSourceFeature(entry);

    STANDARD_SETUP

    // One BadTextInSourceQualifier warning per qualifier, in the same order
    // in which the qualifiers were added above.
    const char* const bad_text_msgs[] = {
        "Germline qualifier should not have descriptive text",
        "Rearranged qualifier should not have descriptive text",
        "Transgenic qualifier should not have descriptive text",
        "Environmental_sample qualifier should not have descriptive text",
        "Metagenomic qualifier should not have descriptive text"
    };
    for (const char* msg : bad_text_msgs) {
        expected_errors.push_back(new CExpectedError(
            "lcl|good", eDiag_Warning, "BadTextInSourceQualifier", msg));
    }

    // Further warnings expected for this combination of qualifiers.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BioSourceInconsistency",
        "Germline and rearranged should not both be present"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BioSourceInconsistency",
        "Transgenic and environmental sample should not both be present"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "EnvironSampleMissingQualifier",
        "Environmental sample should also have isolation source or specific host annotated"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
6426 
6427 
BOOST_AUTO_TEST_CASE(Test_InvalidSexQualifier)6428 BOOST_AUTO_TEST_CASE(Test_InvalidSexQualifier)
6429 {
6430     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
6431     unit_test_util::SetLineage(entry, "Viruses; foo");
6432     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, "a");
6433     unit_test_util::SetLineage(entry, "Bacteria; foo");
6434     STANDARD_SETUP
6435 
6436     // unexpected sex qualifier
6437     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidSexQualifier",
6438         "Unexpected use of /sex qualifier"));
6439     //AddChromosomeNoLocation(expected_errors, entry);
6440     eval = validator.Validate(seh, options);
6441     CheckErrors(*eval, expected_errors);
6442 
6443     CLEAR_ERRORS
6444 
6445     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidSexQualifier",
6446             "Unexpected use of /sex qualifier"));
6447     //AddChromosomeNoLocation(expected_errors, entry);
6448     unit_test_util::SetLineage(entry, "Archaea; foo");
6449     eval = validator.Validate(seh, options);
6450     CheckErrors(*eval, expected_errors);
6451     unit_test_util::SetLineage(entry, "Eukaryota; Fungi; foo");
6452     eval = validator.Validate(seh, options);
6453     CheckErrors(*eval, expected_errors);
6454     unit_test_util::SetLineage(entry, "");
6455     expected_errors[0]->SetErrMsg("Invalid value (a) for /sex qualifier");
6456     expected_errors[0]->SetSeverity(eDiag_Error);
6457     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingLineage",
6458         "No lineage for this BioSource."));
6459     eval = validator.Validate(seh, options);
6460     CheckErrors(*eval, expected_errors);
6461 
6462     CLEAR_ERRORS
6463 
6464     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingLineage",
6465             "No lineage for this BioSource."));
6466     //AddChromosomeNoLocation(expected_errors, entry);
6467 
6468     // no error if acceptable value
6469     vector<string> ok_sex_vals;
6470     ok_sex_vals.push_back("female");
6471     ok_sex_vals.push_back("male");
6472     ok_sex_vals.push_back("hermaphrodite");
6473     ok_sex_vals.push_back("unisexual");
6474     ok_sex_vals.push_back("bisexual");
6475     ok_sex_vals.push_back("asexual");
6476     ok_sex_vals.push_back("monoecious");
6477     ok_sex_vals.push_back("monecious");
6478     ok_sex_vals.push_back("dioecious");
6479     ok_sex_vals.push_back("diecious");
6480 
6481     for (vector<string>::iterator it = ok_sex_vals.begin();
6482         it != ok_sex_vals.end();
6483         ++it) {
6484         unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, "");
6485         unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, *it);
6486         eval = validator.Validate(seh, options);
6487         CheckErrors(*eval, expected_errors);
6488     }
6489 
6490     CLEAR_ERRORS
6491 
6492         unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, "");
6493     // mating-type error for animal
6494     unit_test_util::SetLineage(entry, "Eukaryota; Metazoa; foo");
6495     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_mating_type, "a");
6496     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidMatingType",
6497         "Unexpected use of /mating_type qualifier"));
6498     //AddChromosomeNoLocation(expected_errors, entry);
6499     eval = validator.Validate(seh, options);
6500     CheckErrors(*eval, expected_errors);
6501     // mating-type error for 3 plant lineages
6502     unit_test_util::SetLineage(entry, "Eukaryota; Viridiplantae; Streptophyta; Embryophyta; foo");
6503     eval = validator.Validate(seh, options);
6504     CheckErrors(*eval, expected_errors);
6505     unit_test_util::SetLineage(entry, "Eukaryota; Rhodophyta; foo");
6506     eval = validator.Validate(seh, options);
6507     CheckErrors(*eval, expected_errors);
6508     unit_test_util::SetLineage(entry, "Eukaryota; stramenopiles; Phaeophyceae; foo");
6509     eval = validator.Validate(seh, options);
6510     CheckErrors(*eval, expected_errors);
6511     // mating-type error for virus
6512     unit_test_util::SetLineage(entry, "Viruses; foo");
6513     eval = validator.Validate(seh, options);
6514     CheckErrors(*eval, expected_errors);
6515     // for other lineages, error if sex value
6516     unit_test_util::SetLineage(entry, "Eukaryota; Fungi; foo");
6517 
6518     for (vector<string>::iterator it = ok_sex_vals.begin();
6519         it != ok_sex_vals.end();
6520         ++it) {
6521         unit_test_util::SetSubSource(entry, CSubSource::eSubtype_mating_type, "");
6522         unit_test_util::SetSubSource(entry, CSubSource::eSubtype_mating_type, *it);
6523         eval = validator.Validate(seh, options);
6524         CheckErrors(*eval, expected_errors);
6525     }
6526     CLEAR_ERRORS
6527 
6528     // no error if not valid sex value
6529     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_mating_type, "");
6530     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_mating_type, "a");
6531     //AddChromosomeNoLocation(expected_errors, entry);
6532 
6533     eval = validator.Validate(seh, options);
6534     CheckErrors(*eval, expected_errors);
6535 
6536 }
6537 
6538 
BOOST_AUTO_TEST_CASE(Test_HIVMolType)
{
    // Good sequence re-labelled as Human immunodeficiency virus (taxon 12721).
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    unit_test_util::SetTaxname(entry, "Human immunodeficiency virus");
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 12721);
    unit_test_util::SetLineage(entry, "Cyanobacteria");

    STANDARD_SETUP

    // Runs the validator and compares the result with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Molecule left as built by BuildGoodSeq(): the DNA/proviral warning is
    // expected.
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_pathovar, "");
    CExpectedError* dna_warning = new CExpectedError(
        "lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
        "HIV with moltype DNA should be proviral");
    expected_errors.push_back(dna_warning);
    validate_and_check();

    CLEAR_ERRORS

    // Molecule switched to RNA with biomol mRNA: an informational message is
    // expected instead.
    CExpectedError* mrna_info = new CExpectedError(
        "lcl|good", eDiag_Info, "InconsistentVirusMoltype",
        "HIV with mRNA molecule type is rare");
    expected_errors.push_back(mrna_info);
    entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
    unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
    validate_and_check();

    CLEAR_ERRORS
}
6570 
BOOST_AUTO_TEST_CASE(Test_MissingPlasmid)
{
    // Arabidopsis thaliana (taxon 3702) source with a plasmid-name subsource;
    // this test has not yet set the genome to plasmid.
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 3702);
    unit_test_util::SetLineage(entry, "Cyanobacteria");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plasmid_name, "pfoo");

    STANDARD_SETUP

    // Runs the validator and compares the result with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // plasmid name without plasmid location
    CExpectedError* missing_location = new CExpectedError(
        "lcl|good", eDiag_Warning, "MissingPlasmidLocation",
        "Plasmid subsource but not plasmid location");
    expected_errors.push_back(missing_location);
    validate_and_check();

    CLEAR_ERRORS

    // with a plasmid genome nothing is expected
    unit_test_util::SetGenome(entry, CBioSource::eGenome_plasmid);
    validate_and_check();

    // plasmid genome whose plasmid name has been set to the empty string
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plasmid_name, "");
    CExpectedError* missing_name = new CExpectedError(
        "lcl|good", eDiag_Warning, "MissingPlasmidName",
        "Plasmid location set but plasmid name missing. Add a plasmid source modifier with the plasmid name. Use unnamed if the name is not known.");
    expected_errors.push_back(missing_name);
    validate_and_check();

    CLEAR_ERRORS
}
6604 
BOOST_AUTO_TEST_CASE(Test_BadPlastidName)
{
    // Arabidopsis thaliana (taxon 3702) source with genome set to unknown.
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 3702);
    unit_test_util::SetLineage(entry, "Cyanobacteria");
    unit_test_util::SetGenome(entry, CBioSource::eGenome_unknown);

    STANDARD_SETUP

    // Runs the validator and compares the result with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // plastid-name values that are each tried in turn
    const vector<string> plastid_vals = {
        "chloroplast",
        "chromoplast",
        "kinetoplast",
        "plastid",
        "apicoplast",
        "leucoplast",
        "proplastid"
    };

    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plasmid_name, "");

    // One warning is expected throughout; its message is rewritten for each
    // plastid name before validating.
    CExpectedError* plastid_warning = new CExpectedError(
        "lcl|good", eDiag_Warning, "BadPlastidName",
        "Plastid name subsource chloroplast but not chloroplast location");
    expected_errors.push_back(plastid_warning);

    for (const string& plastid : plastid_vals) {
        unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plastid_name, "");
        unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plastid_name, plastid);
        plastid_warning->SetErrMsg("Plastid name subsource " + plastid + " but not " + plastid + " location");
        validate_and_check();
    }

    // a value outside the list above gets a different message
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plastid_name, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plastid_name, "unrecognized");
    plastid_warning->SetErrMsg("Plastid name subsource contains unrecognized value");
    validate_and_check();

    CLEAR_ERRORS
}
6649 
BOOST_AUTO_TEST_CASE(Test_BadBioSourceFrequencyValue)
{
    // Arabidopsis thaliana (taxon 3702) source with frequency qualifier "1".
    CRef<CSeq_entry> entry(unit_test_util::BuildGoodSeq());
    unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 3702);
    unit_test_util::SetLineage(entry, "Cyanobacteria");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_frequency, "1");

    STANDARD_SETUP

    // Runs the validator and compares the result with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // frequency "1": expected at info severity
    CExpectedError* bad_frequency = new CExpectedError(
        "lcl|good", eDiag_Info, "BadBioSourceFrequencyValue",
        "bad frequency qualifier value 1");
    expected_errors.push_back(bad_frequency);
    validate_and_check();

    // frequency "abc": same code, expected at warning severity
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_frequency, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_frequency, "abc");
    bad_frequency->SetSeverity(eDiag_Warning);
    bad_frequency->SetErrMsg("bad frequency qualifier value abc");
    validate_and_check();

    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_frequency, "");

    CLEAR_ERRORS
}
6676 
6677 
BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceInconsistency)6678 BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceInconsistency)
6679 {
6680    // prepare entry
6681     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
6682     unit_test_util::SetTaxname(entry, "Arabidopsis thaliana");
6683     unit_test_util::SetTaxon(entry, 0);
6684     unit_test_util::SetTaxon(entry, 3702);
6685     unit_test_util::SetLineage(entry, "Cyanobacteria");
6686 //    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_variety, "foo");
6687 
6688     STANDARD_SETUP
6689 
6690 
6691     // unexpected qualifiers for viruses
6692     unit_test_util::SetLineage(entry, "Viruses; foo");
6693     unit_test_util::SetGenome(entry, CBioSource::eGenome_unknown);
6694     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, "female");
6695     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidSexQualifier",
6696         "Virus has unexpected Sex qualifier"));
6697     eval = validator.Validate(seh, options);
6698     CheckErrors (*eval, expected_errors);
6699     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, "");
6700     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_line, "foo");
6701     expected_errors[0]->SetErrCode("BioSourceInconsistency");
6702     expected_errors[0]->SetErrMsg("Virus has unexpected Cell-line qualifier");
6703     eval = validator.Validate(seh, options);
6704     CheckErrors (*eval, expected_errors);
6705     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_line, "");
6706     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_type, "foo");
6707     expected_errors[0]->SetErrMsg("Virus has unexpected Cell-type qualifier");
6708     eval = validator.Validate(seh, options);
6709     CheckErrors (*eval, expected_errors);
6710     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_type, "");
6711     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_tissue_type, "foo");
6712     expected_errors[0]->SetErrCode("InvalidTissueType");
6713     expected_errors[0]->SetErrMsg("Virus has unexpected Tissue-type qualifier");
6714     eval = validator.Validate(seh, options);
6715     CheckErrors (*eval, expected_errors);
6716     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_tissue_type, "");
6717     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_dev_stage, "foo");
6718     expected_errors[0]->SetErrCode("BioSourceInconsistency");
6719     expected_errors[0]->SetErrMsg("Virus has unexpected Dev-stage qualifier");
6720     eval = validator.Validate(seh, options);
6721     CheckErrors (*eval, expected_errors);
6722     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_dev_stage, "");
6723     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_breed, "bar");
6724     expected_errors[0]->SetErrMsg("Virus has unexpected Breed qualifier");
6725     eval = validator.Validate(seh, options);
6726     CheckErrors (*eval, expected_errors);
6727     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_breed, "");
6728     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_cultivar, "bar");
6729     expected_errors[0]->SetErrMsg("Virus has unexpected Cultivar qualifier");
6730     eval = validator.Validate(seh, options);
6731     CheckErrors (*eval, expected_errors);
6732     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_cultivar, "");
6733 
6734     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_germline, "true");
6735     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_rearranged, "true");
6736     expected_errors[0]->SetErrMsg("Germline and rearranged should not both be present");
6737     eval = validator.Validate(seh, options);
6738     CheckErrors (*eval, expected_errors);
6739     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_germline, "");
6740     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_rearranged, "");
6741 
6742     CLEAR_ERRORS
6743 
6744     scope.RemoveTopLevelSeqEntry(seh);
6745     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_transgenic, "true");
6746     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "true");
6747     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_isolation_source, "foo");
6748     unit_test_util::SetFocus(entry);
6749     unit_test_util::AddGoodSourceFeature (entry);
6750     seh = scope.AddTopLevelSeqEntry(*entry);
6751     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceInconsistency",
6752                               "Transgenic and environmental sample should not both be present"));
6753     //AddChromosomeNoLocation(expected_errors, entry);
6754 
6755     eval = validator.Validate(seh, options);
6756     CheckErrors (*eval, expected_errors);
6757 
6758     CLEAR_ERRORS
6759 
6760     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_transgenic, "");
6761     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "");
6762     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_isolation_source, "");
6763     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_metagenomic, "true");
6764     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "MissingEnvironmentalSample",
6765         "Metagenomic should also have environmental sample annotated"));
6766     eval = validator.Validate(seh, options);
6767     CheckErrors (*eval, expected_errors);
6768 
6769     CLEAR_ERRORS
6770 
6771     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_metagenomic, "");
6772     unit_test_util::SetLineage(entry, "Eukaryota; foo");
6773     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, "monecious");
6774     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_mating_type, "A");
6775     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceInconsistency",
6776         "Sex and mating type should not both be present"));
6777     eval = validator.Validate(seh, options);
6778     CheckErrors (*eval, expected_errors);
6779 
6780     CLEAR_ERRORS
6781 
6782     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_sex, "");
6783     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_mating_type, "");
6784     unit_test_util::SetLineage(entry, "Eukaryota; metagenomes");
6785     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingMetagenomicQualifier",
6786         "If metagenomes appears in lineage, BioSource should have metagenomic qualifier"));
6787     eval = validator.Validate(seh, options);
6788     CheckErrors (*eval, expected_errors);
6789     CLEAR_ERRORS
6790 
6791 
6792     unit_test_util::SetTaxname (entry, "uncultured bacterium");
6793     unit_test_util::SetLineage (entry, "Bacteria; foo");
6794     unit_test_util::SetTaxon(entry, 0);
6795     unit_test_util::SetTaxon(entry, 77133);
6796     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnculturedNeedsEnvSample",
6797         "Uncultured should also have /environmental_sample"));
6798     eval = validator.Validate(seh, options);
6799     CheckErrors (*eval, expected_errors);
6800 
6801     CLEAR_ERRORS
6802 
6803     scope.RemoveTopLevelSeqEntry(seh);
6804     entry = unit_test_util::BuildGoodSeq();
6805     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "true");
6806     seh = scope.AddTopLevelSeqEntry(*entry);
6807     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6808         "EnvironSampleMissingQualifier",
6809         "Environmental sample should also have isolation source or specific host annotated"));
6810     //AddChromosomeNoLocation(expected_errors, entry);
6811     eval = validator.Validate(seh, options);
6812     CheckErrors (*eval, expected_errors);
6813 
6814     CLEAR_ERRORS
6815 
6816     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "");
6817     unit_test_util::SetDiv(entry, "BCT");
6818     unit_test_util::SetGenome(entry, CBioSource::eGenome_apicoplast);
6819     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6820         "BadOrganelleLocation",
6821         "Bacterial or viral source should not have organelle location"));
6822     eval = validator.Validate(seh, options);
6823     CheckErrors (*eval, expected_errors);
6824     unit_test_util::SetDiv(entry, "VRL");
6825     eval = validator.Validate(seh, options);
6826     CheckErrors (*eval, expected_errors);
6827 
6828     CLEAR_ERRORS
6829 
6830     unit_test_util::SetDiv(entry, "ENV");
6831     unit_test_util::SetGenome(entry, CBioSource::eGenome_unknown);
6832     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
6833         "MissingEnvironmentalSample",
6834         "BioSource with ENV division is missing environmental sample subsource"));
6835     eval = validator.Validate(seh, options);
6836     CheckErrors (*eval, expected_errors);
6837 
6838     CLEAR_ERRORS
6839 
6840     unit_test_util::SetDiv(entry, "");
6841     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "true");
6842     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_isolation_source, "foo");
6843     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "bar");
6844     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
6845         "StrainWithEnvironSample",
6846         "Strain should not be present in an environmental sample"));
6847     eval = validator.Validate(seh, options);
6848     CheckErrors (*eval, expected_errors);
6849 
6850     CLEAR_ERRORS
6851 
6852     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "");
6853     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_isolation_source, "");
6854     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "");
6855     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_metagenome_source, "foo");
6856     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
6857         "MissingMetagenomicQualifier",
6858         "Metagenome source should also have metagenomic qualifier"));
6859     eval = validator.Validate(seh, options);
6860     CheckErrors (*eval, expected_errors);
6861 
6862     CLEAR_ERRORS
6863 
6864     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_metagenome_source, "");
6865     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_synonym, "synonym value");
6866     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_gb_synonym, "synonym value");
6867     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6868         "OrgModValueInvalid",
6869         "OrgMod synonym is identical to OrgMod gb_synonym"));
6870     eval = validator.Validate(seh, options);
6871     CheckErrors (*eval, expected_errors);
6872 
6873     CLEAR_ERRORS
6874 
6875     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_synonym, "");
6876     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_gb_synonym, "");
6877     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_other, "cRNA");
6878     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
6879         "InconsistentVirusMoltype",
6880         "cRNA note conflicts with molecule type"));
6881     eval = validator.Validate(seh, options);
6882     CheckErrors (*eval, expected_errors);
6883 
6884     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_cRNA);
6885     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
6886     expected_errors[0]->SetErrMsg("cRNA note redundant with molecule type");
6887     eval = validator.Validate(seh, options);
6888     CheckErrors (*eval, expected_errors);
6889 
6890     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_other, "");
6891     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic);
6892     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
6893     unit_test_util::SetLineage (entry, "Viruses; no DNA stage");
6894     expected_errors[0]->SetErrMsg("Genomic DNA viral lineage indicates no DNA stage");
6895     eval = validator.Validate(seh, options);
6896     CheckErrors (*eval, expected_errors);
6897 
6898     unit_test_util::SetLineage (entry, "Bacteria; foo");
6899     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_other, "cRNA");
6900     expected_errors[0]->SetErrMsg("cRNA note conflicts with molecule type");
6901     eval = validator.Validate(seh, options);
6902     CheckErrors (*eval, expected_errors);
6903 
6904     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_cRNA);
6905     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
6906     expected_errors[0]->SetErrMsg("cRNA note redundant with molecule type");
6907     eval = validator.Validate(seh, options);
6908     CheckErrors (*eval, expected_errors);
6909 
6910     CLEAR_ERRORS
6911 
6912     scope.RemoveTopLevelSeqEntry(seh);
6913     entry = unit_test_util::BuildGoodSeq();
6914     seh = scope.AddTopLevelSeqEntry(*entry);
6915 
6916     // report missing env_sample/strain/isolate if bacterial and biosample
6917     unit_test_util::SetLineage (entry, "Bacteria; foo");
6918     CRef<CSeqdesc> biosample(new CSeqdesc());
6919     biosample->SetUser().SetType().SetStr("DBLink");
6920     CRef<CUser_field> f(new CUser_field());
6921     f->SetLabel().SetStr("BioSample");
6922     f->SetData().SetStr("PRJNA12345");
6923     biosample->SetUser().SetData().push_back(f);
6924     entry->SetSeq().SetDescr().Set().push_back(biosample);
6925     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BacteriaMissingSourceQualifier",
6926                               "Bacteria should have strain or isolate or environmental sample"));
6927     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DBLinkBadBioSample",
6928                               "Bad BioSample format - PRJNA12345"));
6929     //AddChromosomeNoLocation(expected_errors, entry);
6930     eval = validator.Validate(seh, options);
6931     CheckErrors (*eval, expected_errors);
6932 
6933     CLEAR_ERRORS
6934 
6935     // no error if strain, isolate, or environmental sample set
6936     scope.RemoveTopLevelSeqEntry(seh);
6937     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "bar");
6938     seh = scope.AddTopLevelSeqEntry(*entry);
6939     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "DBLinkBadBioSample",
6940         "Bad BioSample format - PRJNA12345"));
6941     //AddChromosomeNoLocation(expected_errors, entry);
6942     eval = validator.Validate(seh, options);
6943     CheckErrors (*eval, expected_errors);
6944 
6945     scope.RemoveTopLevelSeqEntry(seh);
6946     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "");
6947     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_isolate, "bar");
6948     seh = scope.AddTopLevelSeqEntry(*entry);
6949     eval = validator.Validate(seh, options);
6950     CheckErrors (*eval, expected_errors);
6951 
6952     scope.RemoveTopLevelSeqEntry(seh);
6953     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_isolate, "");
6954     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "true");
6955     seh = scope.AddTopLevelSeqEntry(*entry);
6956     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "EnvironSampleMissingQualifier",
6957                               "Environmental sample should also have isolation source or specific host annotated"));
6958     eval = validator.Validate(seh, options);
6959     CheckErrors (*eval, expected_errors);
6960 
6961     CLEAR_ERRORS
6962 
6963 }
6964 
6965 
BOOST_AUTO_TEST_CASE(Test_VR_173)
{
    // Bacterial source that carries both a tissue-type and a strain.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetLineage(entry, "Bacteria; foo");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_tissue_type, "X");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "Y");

    STANDARD_SETUP

    // The only expected complaint is about the tissue-type.
    CExpectedError* tissue_type_error = new CExpectedError(
        "lcl|good", eDiag_Error, "InvalidTissueType",
        "Tissue-type is inappropriate for bacteria");
    expected_errors.push_back(tissue_type_error);
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
6982 
BOOST_AUTO_TEST_CASE(Test_InconsistentVirusMoltype)6983 BOOST_AUTO_TEST_CASE(Test_InconsistentVirusMoltype)
6984 {
6985     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
6986 
6987     STANDARD_SETUP
6988 
6989     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
6990     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
6991         "Negative-sense single-stranded RNA virus with plus strand CDS should be cRNA"));
6992     expected_errors[0]->SetAccession("lcl|nuc");
6993     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MolInfoConflictsWithBioSource",
6994                               "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
6995     expected_errors[1]->SetAccession("lcl|nuc");
6996     eval = validator.Validate(seh, options);
6997     CheckErrors(*eval, expected_errors);
6998     CLEAR_ERRORS
6999 
7000     // error remains if mRNA
7001         unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_mRNA);
7002     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
7003         "Negative-sense single-stranded RNA virus with plus strand CDS should be cRNA"));
7004     expected_errors[0]->SetAccession("lcl|nuc");
7005     entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7006     eval = validator.Validate(seh, options);
7007     //AddChromosomeNoLocation(expected_errors, entry);
7008     CheckErrors(*eval, expected_errors);
7009     // error goes away if mRNA or cRNA or ambisense or synthetic
7010     CLEAR_ERRORS
7011 
7012     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_cRNA);
7013     eval = validator.Validate(seh, options);
7014     CheckErrors(*eval, expected_errors);
7015     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_genomic);
7016     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Arenaviridae");
7017     eval = validator.Validate(seh, options);
7018     CheckErrors(*eval, expected_errors);
7019     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Phlebovirus");
7020     eval = validator.Validate(seh, options);
7021     CheckErrors(*eval, expected_errors);
7022     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tospovirus");
7023     eval = validator.Validate(seh, options);
7024     CheckErrors(*eval, expected_errors);
7025     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tenuivirus");
7026     eval = validator.Validate(seh, options);
7027     CheckErrors(*eval, expected_errors);
7028     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
7029     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_synthetic);
7030     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_other);
7031     eval = validator.Validate(seh, options);
7032     CheckErrors(*eval, expected_errors);
7033     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_genomic);
7034     unit_test_util::SetDiv(entry, "VRL");
7035     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_mut);
7036     eval = validator.Validate(seh, options);
7037     CheckErrors(*eval, expected_errors);
7038     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_artificial);
7039     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_other_genetic);
7040     unit_test_util::SetSynthetic_construct(entry);
7041     eval = validator.Validate(seh, options);
7042     CheckErrors(*eval, expected_errors);
7043 
7044     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_synthetic);
7045     unit_test_util::SetSebaea_microphylla(entry);
7046     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_other);
7047     eval = validator.Validate(seh, options);
7048     CheckErrors(*eval, expected_errors);
7049     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_unknown);
7050     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_genomic);
7051 
7052     scope.RemoveTopLevelSeqEntry(seh);
7053     unit_test_util::RevComp(entry);
7054     seh = scope.AddTopLevelSeqEntry(*entry);
7055     // still no error if genomic
7056     eval = validator.Validate(seh, options);
7057     CheckErrors(*eval, expected_errors);
7058 
7059     CLEAR_ERRORS
7060 
7061     // error if not genomic
7062     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_mRNA);
7063     entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7064     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "CDSonMinusStrandMRNA",
7065         "CDS should not be on minus strand of mRNA molecule"));
7066     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
7067         "Negative-sense single-stranded RNA virus with minus strand CDS should be genomic RNA"));
7068     //AddChromosomeNoLocation(expected_errors, entry);
7069     eval = validator.Validate(seh, options);
7070     CheckErrors(*eval, expected_errors);
7071 
7072     CLEAR_ERRORS
7073 
7074         scope.RemoveTopLevelSeqEntry(seh);
7075     entry = unit_test_util::BuildGoodSeq();
7076     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
7077     CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
7078     misc_feat->SetComment("nonfunctional");
7079     seh = scope.AddTopLevelSeqEntry(*entry);
7080     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7081         "Negative-sense single-stranded RNA virus with nonfunctional plus strand misc_feature should be cRNA"));
7082     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
7083                               "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
7084     //AddChromosomeNoLocation(expected_errors, entry);
7085     eval = validator.Validate(seh, options);
7086     CheckErrors(*eval, expected_errors);
7087 
7088     // error stays if mRNA
7089     CLEAR_ERRORS
7090 
7091     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7092         "Negative-sense single-stranded RNA virus with nonfunctional plus strand misc_feature should be cRNA"));
7093         unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
7094     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7095     //AddChromosomeNoLocation(expected_errors, entry);
7096     eval = validator.Validate(seh, options);
7097     CheckErrors(*eval, expected_errors);
7098 
7099     // error goes away if cRNA or ambisense or synthetic
7100     CLEAR_ERRORS
7101 
7102     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_cRNA);
7103     eval = validator.Validate(seh, options);
7104     CheckErrors(*eval, expected_errors);
7105     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic);
7106     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Arenaviridae");
7107     eval = validator.Validate(seh, options);
7108     CheckErrors(*eval, expected_errors);
7109     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Phlebovirus");
7110     eval = validator.Validate(seh, options);
7111     CheckErrors(*eval, expected_errors);
7112     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tospovirus");
7113     eval = validator.Validate(seh, options);
7114     CheckErrors(*eval, expected_errors);
7115     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses; Tenuivirus");
7116     eval = validator.Validate(seh, options);
7117     CheckErrors(*eval, expected_errors);
7118 
7119     scope.RemoveTopLevelSeqEntry(seh);
7120     unit_test_util::RevComp(entry);
7121     seh = scope.AddTopLevelSeqEntry(*entry);
7122     // still no error if genomic
7123     eval = validator.Validate(seh, options);
7124     CheckErrors(*eval, expected_errors);
7125 
7126     // error if not genomic
7127     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
7128     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7129     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7130         "Ambisense virus should be genomic RNA or cRNA"));
7131     eval = validator.Validate(seh, options);
7132     CheckErrors(*eval, expected_errors);
7133 
7134     CLEAR_ERRORS
7135 
7136     unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
7137     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7138     unit_test_util::SetLineage(entry, "Viruses; negative-strand viruses");
7139     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
7140         "Negative-sense single-stranded RNA virus with nonfunctional minus strand misc_feature should be genomic RNA"));
7141     eval = validator.Validate(seh, options);
7142     CheckErrors(*eval, expected_errors);
7143 
7144     CLEAR_ERRORS
7145 
7146 
7147 }
7148 
7149 
BOOST_AUTO_TEST_CASE(Test_SingleStrandViruses)7150 BOOST_AUTO_TEST_CASE(Test_SingleStrandViruses)
7151 {
7152     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
7153     unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses");
7154     unit_test_util::SetBiomol(entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_cRNA);
7155     entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
7156 
7157     STANDARD_SETUP
7158 
7159     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InconsistentVirusMoltype",
7160         "Positive-sense single-stranded RNA virus should be genomic RNA"));
7161     //AddChromosomeNoLocation(expected_errors, entry);
7162     eval = validator.Validate(seh, options);
7163     CheckErrors(*eval, expected_errors);
7164 
7165     // error goes away if ambisense or synthetic
7166     CLEAR_ERRORS
7167 
7168     //AddChromosomeNoLocation(expected_errors, entry);
7169     unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Arenaviridae");
7170     eval = validator.Validate(seh, options);
7171     CheckErrors(*eval, expected_errors);
7172     unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Phlebovirus");
7173     eval = validator.Validate(seh, options);
7174     CheckErrors(*eval, expected_errors);
7175     unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Tospovirus");
7176     eval = validator.Validate(seh, options);
7177     CheckErrors(*eval, expected_errors);
7178     unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses; Tenuivirus");
7179     eval = validator.Validate(seh, options);
7180     CheckErrors(*eval, expected_errors);
7181     unit_test_util::SetLineage(entry, "Viruses; ssRNA positive-strand viruses");
7182     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_synthetic);
7183     eval = validator.Validate(seh, options);
7184     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidForType",
7185         "Molinfo-biomol other should be used if Biosource-location is synthetic"));
7186     CheckErrors(*eval, expected_errors);
7187     CLEAR_ERRORS
7188     unit_test_util::SetDiv(entry, "VRL");
7189     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_mut);
7190     //AddChromosomeNoLocation(expected_errors, entry);
7191     eval = validator.Validate(seh, options);
7192     CheckErrors(*eval, expected_errors);
7193     unit_test_util::SetOrigin(entry, CBioSource::eOrigin_artificial);
7194     unit_test_util::SetSynthetic_construct(entry);
7195     eval = validator.Validate(seh, options);
7196     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidForType",
7197         "artificial origin should have other-genetic"));
7198     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "SyntheticConstructWrongMolType",
7199         "synthetic construct should have other-genetic"));
7200     CheckErrors(*eval, expected_errors);
7201 
7202     CLEAR_ERRORS
7203 }
7204 
7205 
BOOST_AUTO_TEST_CASE(Test_Descr_FastaBracketTitle)
{
    // Good sequence whose title contains a bracketed key=value construct.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    SetTitle(entry, "[a=b]");

    STANDARD_SETUP

    // Validate the entry behind the current seh and compare the reported
    // errors with expected_errors.
    auto run_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "FastaBracketTitle",
        "Title may have unparsed [...=...] construct"));
    //AddChromosomeNoLocation(expected_errors, entry);
    run_and_check();

    CLEAR_ERRORS

    // no error if TMSMART or BankIt
    // First add a general ID from the TMSMART database ...
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> general_id(new CSeq_id());
    general_id->SetGeneral().SetDb("TMSMART");
    general_id->SetGeneral().SetTag().SetStr("good");
    entry->SetSeq().SetId().push_back(general_id);
    seh = scope.AddTopLevelSeqEntry(*entry);
    //AddChromosomeNoLocation(expected_errors, entry);
    run_and_check();
    CLEAR_ERRORS

    // ... then switch the same ID over to BankIt.
    scope.RemoveTopLevelSeqEntry(seh);
    general_id->SetGeneral().SetDb("BankIt");
    seh = scope.AddTopLevelSeqEntry(*entry);
    //AddChromosomeNoLocation(expected_errors, entry);
    run_and_check();

    CLEAR_ERRORS
}
7244 
7245 
BOOST_AUTO_TEST_CASE(Test_Descr_MissingText)
{
    // Good sequence plus one extra descriptor, selected as a comment
    // without any text assigned to it.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeqdesc> empty_desc(new CSeqdesc());
    empty_desc->SetComment();
    entry->SetSeq().SetDescr().Set().push_back(empty_desc);

    STANDARD_SETUP

    // Add the single expected Error for "lcl|good", validate the entry
    // behind the current seh, and compare against expected_errors.
    auto expect_and_check = [&](const char* err_code, const char* err_msg) {
        expected_errors.push_back(new CExpectedError(
            "lcl|good", eDiag_Error, err_code, err_msg));
        //AddChromosomeNoLocation(expected_errors, entry);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // comment
    expect_and_check("CommentMissingText", "Comment descriptor needs text");
    CLEAR_ERRORS

    // title: the same descriptor is switched to a title while the entry
    // is out of the scope
    scope.RemoveTopLevelSeqEntry(seh);
    empty_desc->SetTitle();
    seh = scope.AddTopLevelSeqEntry(*entry);
    expect_and_check("TitleMissingText", "Title descriptor needs text");
    CLEAR_ERRORS

    // name
    scope.RemoveTopLevelSeqEntry(seh);
    empty_desc->SetName();
    seh = scope.AddTopLevelSeqEntry(*entry);
    expect_and_check("MissingText", "Name descriptor needs text");
    CLEAR_ERRORS

    // region
    scope.RemoveTopLevelSeqEntry(seh);
    empty_desc->SetRegion();
    seh = scope.AddTopLevelSeqEntry(*entry);
    expect_and_check("RegionMissingText", "Region descriptor needs text");

    CLEAR_ERRORS
}
7299 
7300 
BOOST_AUTO_TEST_CASE(Test_Descr_BadCollectionDate)7301 BOOST_AUTO_TEST_CASE(Test_Descr_BadCollectionDate)
7302 {
7303     // prepare entry
7304     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
7305     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "May 1, 2010");
7306 
7307     STANDARD_SETUP
7308 
7309     // bad format
7310     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCollectionDate",
7311                               "Collection_date format is not in DD-Mmm-YYYY format"));
7312     //AddChromosomeNoLocation(expected_errors, entry);
7313 
7314     eval = validator.Validate(seh, options);
7315     CheckErrors (*eval, expected_errors);
7316 
7317     // still bad format
7318     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7319     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "1-05-2010");
7320     eval = validator.Validate(seh, options);
7321     CheckErrors (*eval, expected_errors);
7322 
7323     // range has bad format
7324     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7325     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "21-Oct-2013-20-Oct-2015");
7326     eval = validator.Validate(seh, options);
7327     CheckErrors (*eval, expected_errors);
7328 
7329     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7330     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "31-Dec-2099");
7331     expected_errors[0]->SetErrMsg("Collection_date is in the future");
7332     eval = validator.Validate(seh, options);
7333     CheckErrors (*eval, expected_errors);
7334 
7335     // range in future
7336     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7337     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "21-Oct-2013/20-Oct-2030");
7338     eval = validator.Validate(seh, options);
7339     CheckErrors (*eval, expected_errors);
7340 
7341     CLEAR_ERRORS
7342 
7343     // ISO date should be ok
7344     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7345     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "2003-09-29");
7346     //AddChromosomeNoLocation(expected_errors, entry);
7347     eval = validator.Validate(seh, options);
7348     CheckErrors (*eval, expected_errors);
7349 
7350     // range of dates should be ok
7351     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7352     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "Aug-2012/Jan-2013");
7353     eval = validator.Validate(seh, options);
7354     CheckErrors (*eval, expected_errors);
7355     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7356     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "2012/2013");
7357     eval = validator.Validate(seh, options);
7358     CheckErrors (*eval, expected_errors);
7359     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "");
7360     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_collection_date, "06-Aug-2004/07-Jan-2007");
7361     eval = validator.Validate(seh, options);
7362     CheckErrors (*eval, expected_errors);
7363 
7364     bool bad_format = false, in_future = false;
7365     CSubSource::IsCorrectDateFormat("29-Feb-2012", bad_format, in_future);
7366     BOOST_CHECK_EQUAL(bad_format, false);
7367     BOOST_CHECK_EQUAL(in_future, false);
7368 
7369     CSubSource::IsCorrectDateFormat("2014-06", bad_format, in_future);
7370     BOOST_CHECK_EQUAL(bad_format, false);
7371     BOOST_CHECK_EQUAL(in_future, false);
7372 
7373     CLEAR_ERRORS
7374 }
7375 
7376 
BOOST_AUTO_TEST_CASE(Test_Descr_BadPCRPrimerSequence)7377 BOOST_AUTO_TEST_CASE(Test_Descr_BadPCRPrimerSequence)
7378 {
7379     char bad_ch;
7380     BOOST_CHECK_EQUAL(CPCRPrimerSeq::IsValid("01-May-2010", bad_ch), false);
7381     BOOST_CHECK_EQUAL(bad_ch, '0');
7382 
7383     // prepare entry
7384     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
7385     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_fwd_primer_seq, "May 1, 2010");
7386 
7387     STANDARD_SETUP
7388 
7389     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7390                               "PCR forward primer sequence format is incorrect, first bad character is '?'"));
7391     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7392                               "PCR primer does not have both sequences"));
7393     //AddChromosomeNoLocation(expected_errors, entry);
7394 
7395     eval = validator.Validate(seh, options);
7396     CheckErrors (*eval, expected_errors);
7397 
7398     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_fwd_primer_seq, "");
7399     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "01-May-2010");
7400     expected_errors[0]->SetErrMsg("PCR reverse primer sequence format is incorrect, first bad character is '0'");
7401 
7402     eval = validator.Validate(seh, options);
7403     CheckErrors (*eval, expected_errors);
7404 
7405     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "");
7406     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "AAATQAA");
7407     expected_errors[0]->SetErrMsg("PCR reverse primer sequence format is incorrect, first bad character is 'q'");
7408 
7409     eval = validator.Validate(seh, options);
7410     CheckErrors (*eval, expected_errors);
7411 
7412     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "");
7413     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "AAATGAA;AA");
7414     expected_errors[0]->SetErrMsg("PCR reverse primer sequence format is incorrect, first bad character is '?'");
7415 
7416     eval = validator.Validate(seh, options);
7417     CheckErrors (*eval, expected_errors);
7418 
7419     CLEAR_ERRORS
7420 
7421     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "");
7422     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "(AAATGAA,WW)");
7423     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_fwd_primer_seq, "(AAATGAA,W:W)");
7424 
7425     //AddChromosomeNoLocation(expected_errors, entry);
7426     eval = validator.Validate(seh, options);
7427     CheckErrors (*eval, expected_errors);
7428 
7429     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_fwd_primer_seq, "");
7430     unit_test_util::SetSubSource (entry, CSubSource::eSubtype_rev_primer_seq, "");
7431     NON_CONST_ITERATE(CSeq_descr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
7432         if ((*it)->IsSource()) {
7433             CRef<CPCRPrimer> fwd(new CPCRPrimer());
7434             fwd->SetName().Set("AATTGGCCAATTGGC");
7435             fwd->SetSeq().Set("AATTGGCCAATTGG4C");
7436             CRef<CPCRReaction> reaction(new CPCRReaction());
7437             reaction->SetForward().Set().push_back(fwd);
7438             CRef<CPCRPrimer> rev(new CPCRPrimer());
7439             rev->SetName().Set("AATTGGCCAATTGGC");
7440             rev->SetSeq().Set("AATTGGCCAATTGG5C");
7441             reaction->SetReverse().Set().push_back(rev);
7442             (*it)->SetSource().SetPcr_primers().Set().push_back(reaction);
7443         }
7444     }
7445 
7446     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7447                               "PCR forward primer sequence format is incorrect, first bad character is '4'"));
7448     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerName",
7449                               "PCR forward primer name appears to be a sequence"));
7450     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerSequence",
7451                               "PCR reverse primer sequence format is incorrect, first bad character is '5'"));
7452     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerName",
7453                               "PCR reverse primer name appears to be a sequence"));
7454 
7455     eval = validator.Validate(seh, options);
7456     CheckErrors (*eval, expected_errors);
7457 
7458     CLEAR_ERRORS
7459 }
7460 
BOOST_AUTO_TEST_CASE(Test_Descr_ModifyPCRPrimer)
{
    // Runs CPCRPrimerSeq::TrimJunk on a copy of `input` and checks both the
    // returned "was modified" flag and the resulting string.
    auto check_trim_junk = [](const string& input, bool expect_modified, const string& expect_result) {
        string primer(input);
        const bool was_modified = CPCRPrimerSeq::TrimJunk(primer);
        BOOST_CHECK_EQUAL(was_modified, expect_modified);
        BOOST_CHECK_EQUAL(primer, expect_result);
    };

    // Same contract, for CPCRPrimerSeq::Fixi.
    auto check_fixi = [](const string& input, bool expect_modified, const string& expect_result) {
        string primer(input);
        const bool was_modified = CPCRPrimerSeq::Fixi(primer);
        BOOST_CHECK_EQUAL(was_modified, expect_modified);
        BOOST_CHECK_EQUAL(primer, expect_result);
    };

    // TrimJunk: 5'/3' style decorations at the ends are stripped; a plain
    // sequence is left untouched.
    check_trim_junk("5-agtctctctc-", true, "agtctctctc");
    check_trim_junk("5`aattggccaattg3'", true, "aattggccaattg");
    check_trim_junk("aattggccaacct", false, "aattggccaacct");

    // Fixi: "<I>" is lower-cased, an unterminated "<i" gets its closing '>',
    // and an already well-formed "<i>" is reported as unmodified.
    check_fixi("agttt<I>tagaga<i>gac", true, "agttt<i>tagaga<i>gac");
    check_fixi("agtccat<iagata>gtct", true, "agtccat<i>agata>gtct");
    check_fixi("agtccat<i>gtctaaa", false, "agtccat<i>gtctaaa");
}
7495 
BOOST_AUTO_TEST_CASE(Test_Descr_BadPunctuation)
{
    // Good sequence plus a title descriptor ending in a period.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeqdesc> title_desc(new CSeqdesc());
    title_desc->SetTitle("abc.");
    entry->SetSeq().SetDescr().Set().push_back(title_desc);

    STANDARD_SETUP

    // The same single warning is expected for every title tried below.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPunctuation",
                              "Title descriptor ends in bad punctuation"));
    //AddChromosomeNoLocation(expected_errors, entry);

    // trailing period (set during setup)
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // trailing comma, semicolon and colon, in that order
    const char* const other_bad_titles[] = { "abc,", "abc;", "abc:" };
    for (const char* bad_title : other_bad_titles) {
        title_desc->SetTitle(bad_title);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS
}
7531 
7532 
BOOST_AUTO_TEST_CASE(Test_Descr_BadPCRPrimerName)
{
    // Forward primer *name* that has the shape of a primer sequence list.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_fwd_primer_name, "(AAATGAA,WW)");

    STANDARD_SETUP

    // Validate the current state of the entry against the expectation list.
    auto revalidate = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPCRPrimerName",
                              "PCR primer name appears to be a sequence"));
    //AddChromosomeNoLocation(expected_errors, entry);
    revalidate();

    // The identical warning is expected when the sequence-like name sits on
    // the reverse primer instead.
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_fwd_primer_name, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_rev_primer_name, "(AAATGAA,W:W)");
    revalidate();

    CLEAR_ERRORS

    // no error if invalid sequence: "AAATQAA" is not flagged as a name that
    // looks like a sequence
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_rev_primer_name, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_rev_primer_name, "AAATQAA");

    //AddChromosomeNoLocation(expected_errors, entry);
    revalidate();

    CLEAR_ERRORS
}
7566 
7567 
BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceOnProtein)
{
    // Nuc-prot set in which the last member of the set (presumably the
    // protein, given the expected message) receives a source descriptor
    // of its own.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> last_member = entry->SetSet().SetSeq_set().back();
    unit_test_util::AddGoodSource(last_member);

    STANDARD_SETUP

    // One warning, reported against the nucleotide id.
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BioSourceOnProtein",
                              "Nuc-prot set has 1 protein with a BioSource descriptor"));
    //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
    //AddChromosomeNoLocation(expected_errors, "lcl|prot");

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
7586 
7587 
BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceDbTagConflict)
{
    // Give the source two dbxrefs that name the same database ("AFTOL") but
    // carry different ids.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    const int aftol_ids[] = { 12345, 12346 };
    for (int tag_id : aftol_ids) {
        unit_test_util::SetDbxref(entry, "AFTOL", tag_id);
    }

    STANDARD_SETUP

    // A single warning about the repeated database is expected.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BioSourceDbTagConflict",
                              "BioSource uses db AFTOL multiple times"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
7606 
7607 
s_ArePrimersUnique(const CPCRReactionSet & rset)7608 bool s_ArePrimersUnique(const CPCRReactionSet& rset)
7609 {
7610     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
7611     CRef<CBioSource> src;
7612     for (auto it : entry->SetSeq().SetDescr().Set()) {
7613         if (it->IsSource()) {
7614             src.Reset(&(it->SetSource()));
7615             break;
7616         }
7617     }
7618     src->SetPcr_primers().Assign(rset);
7619     STANDARD_SETUP
7620     eval = validator.Validate(seh, options);
7621     for (CValidError_CI vit(*eval); vit; ++vit) {
7622         if (NStr::Equal(vit->GetErrCode(), "DuplicatePCRPrimerSequence")) {
7623             return false;
7624         }
7625     }
7626     return true;
7627 }
7628 
7629 
BOOST_AUTO_TEST_CASE(Test_Descr_DuplicatePCRPrimerSequence)
{
    // Part 1: subsource-based primers where each side lists the same
    // sequence twice.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_fwd_primer_seq, "(AAATTTGGGCCC,AAATTTGGGCCC)");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_rev_primer_seq, "(CCCTTTGGGCCC,CCCTTTGGGCCC)");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicatePCRPrimerSequence",
                              "PCR primer sequence has duplicates"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Part 2: build a structured reaction set step by step. After every
    // mutation s_ArePrimersUnique() validates a fresh entry holding a copy
    // of the set and reports whether a duplicate-primer error was raised.
    // The objects are shared through CRefs, so later changes to a primer or
    // reaction are visible through the set.
    CRef<CPCRReactionSet> reactions(new CPCRReactionSet());
    CRef<CPCRReaction> reaction_a(new CPCRReaction());
    CRef<CPCRReaction> reaction_b(new CPCRReaction());
    CRef<CPCRPrimer> fwd_a(new CPCRPrimer());
    CRef<CPCRPrimer> fwd_b(new CPCRPrimer());
    CRef<CPCRPrimer> rev_a(new CPCRPrimer());
    CRef<CPCRPrimer> rev_b(new CPCRPrimer());

    // empty set
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);

    // one empty reaction
    reactions->Set().push_back(reaction_a);
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);

    // two empty reactions: expected to count as duplicates
    reactions->Set().push_back(reaction_b);
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), false);

    // blank forward primer on the first reaction only
    reaction_a->SetForward().Set().push_back(fwd_a);
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);

    // blank forward primer on both reactions
    reaction_b->SetForward().Set().push_back(fwd_b);
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), false);

    // forward sequences: set on one, then different, then identical
    fwd_a->SetSeq().Set("aa");
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);
    fwd_b->SetSeq().Set("tt");
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);
    fwd_b->SetSeq().Set("aa");
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), false);

    // blank reverse primer on the first reaction only
    reaction_a->SetReverse().Set().push_back(rev_a);
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);

    // blank reverse primer on both reactions
    reaction_b->SetReverse().Set().push_back(rev_b);
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), false);

    // reverse names: set on one, then different, then identical
    rev_a->SetName().Set("a name");
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);
    rev_b->SetName().Set("a different name");
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), true);
    rev_b->SetName().Set("a name");
    BOOST_CHECK_EQUAL(s_ArePrimersUnique(*reactions), false);
}
7682 
7683 
BOOST_AUTO_TEST_CASE(Test_Descr_MultipleNames)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Appends a name descriptor with the given text to the sequence and
    // returns it so that the text can be changed later.
    auto add_name_descriptor = [&entry](const char* text) {
        CRef<CSeqdesc> name_desc(new CSeqdesc());
        name_desc->SetName(text);
        entry->SetSeq().SetDescr().Set().push_back(name_desc);
        return name_desc;
    };

    // Two name descriptors carrying identical text.
    add_name_descriptor("name #1");
    CRef<CSeqdesc> second_name = add_name_descriptor("name #1");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleNames",
                              "Undesired multiple name descriptors, identical text"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // With differing text the warning stays, only its wording changes.
    second_name->SetName("name #2");
    expected_errors[0]->SetErrMsg("Undesired multiple name descriptors, different text");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
7710 
7711 
BOOST_AUTO_TEST_CASE(Test_Descr_MultipleComments)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Appends a comment descriptor with the given text to the sequence and
    // returns it so that the text can be changed later.
    auto add_comment_descriptor = [&entry](const char* text) {
        CRef<CSeqdesc> comment_desc(new CSeqdesc());
        comment_desc->SetComment(text);
        entry->SetSeq().SetDescr().Set().push_back(comment_desc);
        return comment_desc;
    };

    // Two comment descriptors carrying identical text.
    add_comment_descriptor("name 1");
    CRef<CSeqdesc> second_comment = add_comment_descriptor("name 1");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleComments",
                              "Undesired multiple comment descriptors, identical text"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // ok if different: the expectation list is empty for this run
    second_comment->SetComment("name 2");
    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
7741 
7742 
BOOST_AUTO_TEST_CASE(Test_Descr_LatLonFormat)
{
    // lat_lon value that is well-formed but followed by extra text.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "40 N 50 E, abc");

    STANDARD_SETUP

    // Validate the current state of the entry against the expectation list.
    auto revalidate = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonFormat",
                              "lat_lon format has extra text after correct dd.dd N|S ddd.dd E|W format"));
    //AddChromosomeNoLocation(expected_errors, entry);
    revalidate();

    // Hemisphere letters swapped between latitude and longitude: same error
    // code, but a different message and raised to eDiag_Error.
    CExpectedError* format_error = expected_errors[0];
    format_error->SetErrMsg("lat_lon format is incorrect - should be dd.dd N|S ddd.dd E|W");
    format_error->SetSeverity(eDiag_Error);
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "40 E 50 N");
    revalidate();

    CLEAR_ERRORS
}
7767 
7768 
BOOST_AUTO_TEST_CASE(Test_Descr_LatLonRange)
{
    // Both coordinates just beyond their limits, northern/eastern variant.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "90.1 N 181 E");

    STANDARD_SETUP

    // Validate the current state of the entry against the expectation list.
    auto revalidate = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // One warning per out-of-range coordinate.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonRange",
                              "latitude value is out of range - should be between 90.00 N and 90.00 S"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LatLonRange",
                              "longitude value is out of range - should be between 180.00 E and 180.00 W"));
    //AddChromosomeNoLocation(expected_errors, entry);
    revalidate();

    // The southern/western variant must produce exactly the same warnings.
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "90.1 S 181 W");
    revalidate();

    CLEAR_ERRORS
}
7792 
7793 
BOOST_AUTO_TEST_CASE(Test_Descr_BadAltitude)
{
    // An altitude given in meters is validated first, with an empty
    // expectation list.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_altitude, "123 m");
    STANDARD_SETUP

    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    // Replaces the altitude value and checks that it is reported as a
    // BadAltitude warning under the default options, and as an error once
    // eVal_genome_submission is added. The expectation list is emptied
    // afterwards.
    auto expect_bad_altitude = [&](const char* altitude, const char* err_msg) {
        unit_test_util::SetSubSource(entry, CSubSource::eSubtype_altitude, "");
        unit_test_util::SetSubSource(entry, CSubSource::eSubtype_altitude, altitude);
        expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAltitude", err_msg));
        //AddChromosomeNoLocation(expected_errors, entry);

        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);

        // raise to error
        expected_errors[0]->SetSeverity(eDiag_Error);
        eval = validator.Validate(seh, options | CValidator::eVal_genome_submission);
        CheckErrors(*eval, expected_errors);

        CLEAR_ERRORS
    };

    // missing unit
    expect_bad_altitude("123",
        "'123' is an invalid altitude value, altitude should be provided in meters");

    // unit other than meters
    expect_bad_altitude("123 ft.",
        "'123 ft.' is an invalid altitude value, altitude should be provided in meters");

    // The fixer converts the value in feet to a value in meters.
    BOOST_CHECK_EQUAL(CSubSource::FixAltitude("123 ft."), "37 m");
}
7839 
7840 
TestSpecificHostNoError(const string & host)7841 void TestSpecificHostNoError(const string& host)
7842 {
7843     // prepare entry
7844     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
7845     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, host);
7846 
7847     STANDARD_SETUP
7848     options |= CValidator::eVal_use_entrez;
7849     //AddChromosomeNoLocation(expected_errors, entry);
7850     eval = validator.Validate(seh, options);
7851     CheckErrors (*eval, expected_errors);
7852     CLEAR_ERRORS
7853 }
7854 
7855 
BOOST_AUTO_TEST_CASE(Test_Descr_BadSpecificHost)7856 BOOST_AUTO_TEST_CASE(Test_Descr_BadSpecificHost)
7857 {
7858     // prepare entry
7859     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
7860     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Metapone madagascaria");
7861 
7862     STANDARD_SETUP
7863 
7864     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadSpecificHost",
7865                               "Specific host value is misspelled: Metapone madagascaria"));
7866     //AddChromosomeNoLocation(expected_errors, entry);
7867     eval = validator.Validate(seh, options);
7868     CheckErrors (*eval, expected_errors);
7869 
7870     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "");
7871     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo Sapiens");
7872     expected_errors[0]->SetErrMsg("Specific host value is incorrectly capitalized: Homo Sapiens");
7873     eval = validator.Validate(seh, options);
7874     CheckErrors (*eval, expected_errors);
7875 
7876     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "");
7877     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo nonrecognizedus");
7878     expected_errors[0]->SetErrMsg("Invalid value for specific host: Homo nonrecognizedus");
7879     eval = validator.Validate(seh, options);
7880     CheckErrors (*eval, expected_errors);
7881 
7882     CLEAR_ERRORS
7883     // should not generate an error
7884     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "");
7885     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Bovine");
7886     //AddChromosomeNoLocation(expected_errors, entry);
7887     eval = validator.Validate(seh, options);
7888     CheckErrors (*eval, expected_errors);
7889 
7890     // also, can ignore text after semicolon
7891     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "");
7892     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo sapiens; sex: female");
7893     eval = validator.Validate(seh, options);
7894     CheckErrors (*eval, expected_errors);
7895 
7896     // should see errors for bad lineages
7897     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "");
7898     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Lentinula edodes");
7899     unit_test_util::SetLineage(entry, "Streptophyta");
7900 
7901     eval = validator.Validate(seh, options);
7902     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
7903         "BadSpecificHost",
7904         "Suspect Host Value - a prokaryote, fungus or virus is suspect as a host for a plant or animal"));
7905     CheckErrors(*eval, expected_errors);
7906     CLEAR_ERRORS
7907 
7908     // others
7909     TestSpecificHostNoError("Racoon");
7910     TestSpecificHostNoError("SNAKE");
7911     TestSpecificHostNoError("Snake");
7912     TestSpecificHostNoError("Turtle");
7913     TestSpecificHostNoError("mallard duck");
7914     TestSpecificHostNoError("Guinea pig");
7915     TestSpecificHostNoError("sea urchin"); // RW-1364
7916 }
7917 
BOOST_AUTO_TEST_CASE(Test_Validity_SpecificHost)7918 BOOST_AUTO_TEST_CASE(Test_Validity_SpecificHost)
7919 {
7920     string host, error_msg;
7921 
7922     host = "home sapiens";
7923     BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
7924     BOOST_CHECK_EQUAL(error_msg, "Specific host value is misspelled: home sapiens");
7925 
7926     host = "Svalbard rock ptarmigan";
7927     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7928     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7929 
7930     host = "Racoon";
7931     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7932     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7933 
7934     host = "SNAKE";
7935     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7936     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7937 
7938     host = "Snake";
7939     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7940     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7941 
7942     host = "Turtle";
7943     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7944     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7945 
7946 
7947     host = "Homo sapiens";
7948     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7949     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7950 
7951     host = "Homo supiens";
7952     BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
7953     BOOST_CHECK_EQUAL(error_msg, string("Invalid value for specific host: Homo supiens"));
7954 
7955     host = "Pinus sp.";
7956     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7957     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7958 
7959     host = "Gallus Gallus";
7960     BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
7961     BOOST_CHECK_EQUAL(error_msg, string("Specific host value is incorrectly capitalized: Gallus Gallus"));
7962 
7963     host = "Eschericia coli";
7964     BOOST_CHECK_EQUAL(false, IsSpecificHostValid(host, error_msg));
7965     BOOST_CHECK_EQUAL(error_msg, string("Specific host value is misspelled: Eschericia coli"));
7966 
7967     host = "Avian";
7968     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7969     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7970 
7971     host = "Bovine";
7972     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7973     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7974 
7975     host = "Pig";
7976     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7977     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7978 
7979     host = "Chicken";
7980     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7981     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7982 
7983     host = "turtle";
7984     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7985     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7986 
7987     host = "Homo sapiens; sex: female";
7988     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7989     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7990 
7991     host = "Guinea pig";
7992     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7993     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7994 
7995     host = "Equus sp.";
7996     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
7997     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
7998 
7999     host = "Ficus sp.";
8000     BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));
8001     BOOST_CHECK_EQUAL(error_msg, kEmptyStr);
8002 }
8003 
8004 
// Runs FixSpecificHost() over a table of specific-host values and checks the
// string it returns for each one.
BOOST_AUTO_TEST_CASE(Test_FixSpecificHost)
{
    // input    - value handed to FixSpecificHost()
    // expected - required return value; NULL means the result must compare
    //            equal to the input variable itself after the call
    struct SHostFixCase {
        const char* input;
        const char* expected;
    };
    static const SHostFixCase kHostFixCases[] = {
        { "home sapiens",              "Homo sapiens" },
        { "homo sapiens",              "Homo sapiens" },
        { "Homo supiens",              "" },
        { "Pinus sp.",                 "Pinus sp." },
        { "Gallus Gallus",             "Gallus gallus" },
        { "Eschericia coli",           "Escherichia coli" },
        { "Avian",                     NULL },
        { "",                          "" },
        { "Bovine",                    "Bovine" },
        { "Homo sapiens",              "Homo sapiens" },
        { "Pig",                       "Pig" },
        { " Chicken",                  "Chicken" },
        { "Homo sapiens; sex: female", NULL },
        { "HUMAN",                     "Homo sapiens" }
    };
    const size_t kNumHostFixCases = sizeof(kHostFixCases) / sizeof(kHostFixCases[0]);

    for (size_t case_idx = 0; case_idx < kNumHostFixCases; ++case_idx) {
        const SHostFixCase& test_case = kHostFixCases[case_idx];

        string host = test_case.input;
        string hostfix = FixSpecificHost(host);

        if (test_case.expected != NULL) {
            BOOST_CHECK_EQUAL(hostfix, string(test_case.expected));
        } else {
            BOOST_CHECK_EQUAL(hostfix, host);
        }
    }
}
8065 
8066 
// A RefGeneTracking user object whose Status is "unknown" must produce a
// single RefGeneTrackingIllegalStatus error.
BOOST_AUTO_TEST_CASE(Test_Descr_RefGeneTrackingIllegalStatus)
{
    const string kExpectedMsg = "RefGeneTracking object has illegal Status 'unknown'";

    // Good sequence whose first Seq-id is switched to the "other" accession
    // NC_123456, carrying a RefGeneTracking user object with Status "unknown".
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
    AddRefGeneTrackingUserObject(entry);
    SetRefGeneTrackingStatus(entry, "unknown");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("ref|NC_123456|", eDiag_Error, "RefGeneTrackingIllegalStatus", kExpectedMsg));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
8085 
8086 
// Every country name in the table below must be reported with an Info-level
// ReplacedCountryCode message that quotes the name.
BOOST_AUTO_TEST_CASE(Test_Descr_ReplacedCountryCode)
{
    static const char* const kOldCountries[] = {
        "Belgian Congo",
        "British Guiana",
        "Burma",
        "Czechoslovakia",
        "Korea",
        "Serbia and Montenegro",
        "Siam",
        "USSR",
        "Yugoslavia",
        "Zaire",
        "Macedonia"
    };
    const size_t kNumOldCountries = sizeof(kOldCountries) / sizeof(kOldCountries[0]);

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    // A single expectation is reused; its message is filled in per country.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "ReplacedCountryCode",
                              ""));
    //AddChromosomeNoLocation(expected_errors, entry);

    for (size_t country_idx = 0; country_idx < kNumOldCountries; ++country_idx) {
        const string old_name(kOldCountries[country_idx]);

        unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, old_name);
        expected_errors[0]->SetErrMsg("Replaced country name [" + old_name + "]");
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);

        // reset the country qualifier to "" before trying the next name
        unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "");
    }

    CLEAR_ERRORS
}
8121 
8122 
BOOST_AUTO_TEST_CASE(Test_Descr_BadInstitutionCode)8123 BOOST_AUTO_TEST_CASE(Test_Descr_BadInstitutionCode)
8124 {
8125     // prepare entry
8126     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
8127 
8128     STANDARD_SETUP
8129 
8130     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadInstitutionCode",
8131                               "Voucher is missing institution code"));
8132     //AddChromosomeNoLocation(expected_errors, entry);
8133     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, ":foo");
8134     eval = validator.Validate(seh, options);
8135     CheckErrors (*eval, expected_errors);
8136 
8137     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "");
8138     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, ":foo");
8139     eval = validator.Validate(seh, options);
8140     CheckErrors (*eval, expected_errors);
8141 
8142     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "");
8143     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, ":foo");
8144     eval = validator.Validate(seh, options);
8145     CheckErrors (*eval, expected_errors);
8146     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "");
8147 
8148     // codes that need disambiguating country
8149     expected_errors[0]->SetSeverity(eDiag_Warning);
8150     vector<string> ambig;
8151     // specimen voucher codes
8152     ambig.push_back("BAH");
8153     ambig.push_back("ACE");
8154     ambig.push_back("SLU");
8155     ambig.push_back("UAB");
8156     ambig.push_back("CAIM");
8157     ambig.push_back("HER");
8158     ambig.push_back("DSC");
8159     ambig.push_back("DNHM");
8160     ambig.push_back("BNHM");
8161     ambig.push_back("UI");
8162     ambig.push_back("KMK");
8163     ambig.push_back("MT");
8164     ambig.push_back("MP");
8165     ambig.push_back("NASC");
8166     ambig.push_back("IZAC");
8167     ambig.push_back("CCG");
8168     ambig.push_back("PIN");
8169     ambig.push_back("HSU");
8170     ambig.push_back("CAUP");
8171     ambig.push_back("ISU");
8172     ambig.push_back("SDSU");
8173     ambig.push_back("GC");
8174     ambig.push_back("UNL");
8175     ambig.push_back("MZUP");
8176     ambig.push_back("MG");
8177     ambig.push_back("HNHM");
8178     ambig.push_back("PMS");
8179     ambig.push_back("LE");
8180     ambig.push_back("GCM");
8181     ambig.push_back("TMP");
8182     ambig.push_back("DMNH");
8183     ambig.push_back("ZMUH");
8184     ambig.push_back("SMF");
8185     ambig.push_back("ZSP");
8186     ambig.push_back("TAU");
8187     ambig.push_back("MJG");
8188     ambig.push_back("DUM");
8189     ambig.push_back("ANU");
8190     ambig.push_back("CPAP");
8191     ambig.push_back("CSU");
8192     ambig.push_back("WACA");
8193     ambig.push_back("MMNH");
8194     ambig.push_back("ALA");
8195     ambig.push_back("RV");
8196     ambig.push_back("ABS");
8197     ambig.push_back("FM");
8198     ambig.push_back("HNU");
8199     ambig.push_back("PO");
8200     ambig.push_back("GAM");
8201     ambig.push_back("MCM");
8202     ambig.push_back("LU");
8203     ambig.push_back("SDM");
8204     ambig.push_back("PMK");
8205     ambig.push_back("VI");
8206     ambig.push_back("IMM");
8207     ambig.push_back("R");
8208     ambig.push_back("CHM");
8209     ambig.push_back("CMC");
8210     ambig.push_back("JSPC");
8211     ambig.push_back("YU");
8212     ambig.push_back("STM");
8213     ambig.push_back("RSM");
8214     ambig.push_back("BB");
8215     ambig.push_back("BHM");
8216     ambig.push_back("CBU");
8217     ambig.push_back("MCCM");
8218     ambig.push_back("NMSU");
8219     ambig.push_back("OTM");
8220     ambig.push_back("LP");
8221     ambig.push_back("SME");
8222     ambig.push_back("PEM");
8223     ambig.push_back("UMF");
8224     ambig.push_back("CIS");
8225     ambig.push_back("LBG");
8226     ambig.push_back("CCAC");
8227     ambig.push_back("SNP");
8228     ambig.push_back("UT");
8229     ambig.push_back("IBA");
8230     ambig.push_back("UNCC");
8231     ambig.push_back("NHMC");
8232     ambig.push_back("BAC");
8233     ambig.push_back("PMG");
8234     ambig.push_back("MRC");
8235     ambig.push_back("ETH");
8236     ambig.push_back("OMC");
8237     ambig.push_back("NMV");
8238     ambig.push_back("MLS");
8239     ambig.push_back("NJM");
8240     ambig.push_back("INA");
8241     ambig.push_back("BCM");
8242     ambig.push_back("YM");
8243     ambig.push_back("CAM");
8244     ambig.push_back("UA");
8245     ambig.push_back("OSM");
8246     ambig.push_back("CPS");
8247     ambig.push_back("POKM");
8248     ambig.push_back("VSM");
8249     ambig.push_back("ZMG");
8250     ambig.push_back("IO");
8251     ambig.push_back("USM");
8252     ambig.push_back("UCS");
8253     ambig.push_back("CN");
8254     ambig.push_back("PCM");
8255     ambig.push_back("MU");
8256     ambig.push_back("ISC");
8257     ambig.push_back("CIB");
8258     ambig.push_back("GML");
8259     ambig.push_back("NU");
8260     ambig.push_back("NCSC");
8261     ambig.push_back("MHNN");
8262     ambig.push_back("NCC");
8263     ambig.push_back("MSM");
8264     ambig.push_back("RM");
8265     ambig.push_back("MBM");
8266     ambig.push_back("UPM");
8267     ambig.push_back("MSU");
8268     ambig.push_back("PI");
8269     ambig.push_back("CENA");
8270     ambig.push_back("IBRP");
8271     ambig.push_back("CRE");
8272     ambig.push_back("FSC");
8273     ambig.push_back("ENCB");
8274     ambig.push_back("BAS");
8275     ambig.push_back("GOE");
8276     ambig.push_back("PSS");
8277     ambig.push_back("CCB");
8278     ambig.push_back("SUM");
8279     ambig.push_back("NMPG");
8280     ambig.push_back("USP");
8281     ambig.push_back("IPB");
8282     ambig.push_back("BCC");
8283     ambig.push_back("FNU");
8284     ambig.push_back("SHM");
8285     ambig.push_back("TNSC");
8286     ambig.push_back("LS");
8287     ambig.push_back("TMC");
8288     ambig.push_back("HUT");
8289     ambig.push_back("ZMUO");
8290     ambig.push_back("ALM");
8291     ambig.push_back("ITCC");
8292     ambig.push_back("TM");
8293     ambig.push_back("WB");
8294     ambig.push_back("ZMK");
8295     ambig.push_back("LBM");
8296     ambig.push_back("NI");
8297     ambig.push_back("CB");
8298     ambig.push_back("AMP");
8299     ambig.push_back("MM");
8300     ambig.push_back("PMU");
8301     ambig.push_back("DM");
8302     ambig.push_back("RIVE");
8303     ambig.push_back("TARI");
8304     ambig.push_back("CSCS");
8305     ambig.push_back("PSU");
8306     ambig.push_back("IMT");
8307     ambig.push_back("MZV");
8308     ambig.push_back("SZE");
8309     ambig.push_back("CUVC");
8310     ambig.push_back("LMJ");
8311     ambig.push_back("UC");
8312     ambig.push_back("ZIUS");
8313     ambig.push_back("FRI");
8314     ambig.push_back("CDA");
8315     ambig.push_back("ZMUA");
8316     ambig.push_back("MZUC");
8317     ambig.push_back("BR");
8318     ambig.push_back("UG");
8319     ambig.push_back("MDH");
8320     ambig.push_back("USD");
8321     ambig.push_back("MNHM");
8322     ambig.push_back("MAD");
8323     ambig.push_back("PMA");
8324     ambig.push_back("ICN");
8325     ambig.push_back("TU");
8326     ambig.push_back("PMNH");
8327     ambig.push_back("SAU");
8328     ambig.push_back("KM");
8329     ambig.push_back("GMNH");
8330     ambig.push_back("SSM");
8331     ambig.push_back("MZ");
8332     ambig.push_back("WSU");
8333     ambig.push_back("CIAN");
8334     ambig.push_back("ZMT");
8335     ambig.push_back("IMS");
8336     ambig.push_back("TCDU");
8337     ambig.push_back("SIAC");
8338     ambig.push_back("DFEC");
8339     ambig.push_back("CBD");
8340     ambig.push_back("SWC");
8341     ambig.push_back("MD");
8342     ambig.push_back("FU");
8343     ambig.push_back("UV");
8344     ambig.push_back("URM");
8345     ambig.push_back("JNU");
8346     ambig.push_back("IZ");
8347     ambig.push_back("UAIC");
8348     ambig.push_back("LEB");
8349     ambig.push_back("MCSN");
8350     ambig.push_back("UU");
8351     ambig.push_back("PUC");
8352     ambig.push_back("SNM");
8353     ambig.push_back("AKU");
8354     ambig.push_back("MH");
8355     ambig.push_back("MOR");
8356     ambig.push_back("IM");
8357     ambig.push_back("MSNT");
8358     ambig.push_back("IGM");
8359     ambig.push_back("NAP");
8360     ambig.push_back("NHMR");
8361     ambig.push_back("MW");
8362     ambig.push_back("PPCC");
8363     ambig.push_back("CNHM");
8364     ambig.push_back("IAL");
8365     ambig.push_back("PCU");
8366     ambig.push_back("HM");
8367 
8368     ITERATE (vector<string>, it, ambig) {
8369         expected_errors[0]->SetErrMsg("Institution code " + *it + " needs to be qualified with a <COUNTRY> designation");
8370         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, *it + ":foo");
8371         eval = validator.Validate(seh, options);
8372         CheckErrors (*eval, expected_errors);
8373         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "");
8374     }
8375 
8376     // bio-material
8377     ambig.clear();
8378     ambig.push_back("NASC");
8379     ambig.push_back("TCDU");
8380 
8381     ITERATE (vector<string>, it, ambig) {
8382         expected_errors[0]->SetErrMsg("Institution code " + *it + " needs to be qualified with a <COUNTRY> designation");
8383         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, *it + ":foo");
8384         eval = validator.Validate(seh, options);
8385         CheckErrors (*eval, expected_errors);
8386         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "");
8387     }
8388 
8389     // culture-collection
8390     ambig.clear();
8391     ambig.push_back("CAIM");
8392     ambig.push_back("STM");
8393     ambig.push_back("HER");
8394     ambig.push_back("FSC");
8395     ambig.push_back("MDH");
8396     ambig.push_back("DSC");
8397     ambig.push_back("IFM");
8398     ambig.push_back("MCCM");
8399     ambig.push_back("CCB");
8400     ambig.push_back("LBG");
8401     ambig.push_back("BCC");
8402     ambig.push_back("CCAC");
8403     ambig.push_back("CCF");
8404     ambig.push_back("IBA");
8405     ambig.push_back("CAUP");
8406     ambig.push_back("MRC");
8407     ambig.push_back("ETH");
8408     ambig.push_back("TMC");
8409     ambig.push_back("CBD");
8410     ambig.push_back("HUT");
8411     ambig.push_back("URM");
8412     ambig.push_back("NJM");
8413     ambig.push_back("INA");
8414     ambig.push_back("BTCC");
8415     ambig.push_back("YM");
8416     ambig.push_back("IZ");
8417     ambig.push_back("ITCC");
8418     ambig.push_back("WB");
8419     ambig.push_back("LE");
8420     ambig.push_back("LCC");
8421     ambig.push_back("LBM");
8422     ambig.push_back("NI");
8423     ambig.push_back("CB");
8424     ambig.push_back("AMP");
8425     ambig.push_back("RIVE");
8426     ambig.push_back("DUM");
8427     ambig.push_back("AKU");
8428     ambig.push_back("CN");
8429     ambig.push_back("CCDM");
8430     ambig.push_back("PCM");
8431     ambig.push_back("MU");
8432     ambig.push_back("ISC");
8433     ambig.push_back("IMT");
8434     ambig.push_back("NU");
8435     ambig.push_back("RV");
8436     ambig.push_back("UC");
8437     ambig.push_back("NCSC");
8438     ambig.push_back("CCY");
8439     ambig.push_back("NCC");
8440     ambig.push_back("FRI");
8441     ambig.push_back("GAM");
8442     ambig.push_back("RM");
8443     ambig.push_back("MCM");
8444     ambig.push_back("PPCC");
8445     ambig.push_back("CDA");
8446     ambig.push_back("IAL");
8447     ambig.push_back("VI");
8448     ambig.push_back("PCU");
8449     ambig.push_back("CVCC");
8450     ambig.push_back("BR");
8451     ambig.push_back("MSU");
8452     ITERATE (vector<string>, it, ambig) {
8453         expected_errors[0]->SetErrMsg("Institution code " + *it + " needs to be qualified with a <COUNTRY> designation");
8454         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, *it + ":foo");
8455         eval = validator.Validate(seh, options);
8456         CheckErrors (*eval, expected_errors);
8457         unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "");
8458     }
8459 
8460     expected_errors[0]->SetErrMsg("Institution code zzz is not in list");
8461     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "zzz:foo");
8462     eval = validator.Validate(seh, options);
8463     CheckErrors (*eval, expected_errors);
8464     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "");
8465     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "zzz:foo");
8466     eval = validator.Validate(seh, options);
8467     CheckErrors (*eval, expected_errors);
8468     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "");
8469     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "zzz:foo");
8470     eval = validator.Validate(seh, options);
8471     CheckErrors (*eval, expected_errors);
8472     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "");
8473 
8474     expected_errors[0]->SetErrMsg("Institution code abrc exists, but correct capitalization is ABRC");
8475     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "abrc:x");
8476     eval = validator.Validate(seh, options);
8477     CheckErrors (*eval, expected_errors);
8478     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "");
8479 
8480     expected_errors[0]->SetErrMsg("Institution code a exists, but correct capitalization is A");
8481     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "a:foo");
8482     eval = validator.Validate(seh, options);
8483     CheckErrors (*eval, expected_errors);
8484     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "");
8485 
8486     expected_errors[0]->SetErrMsg("Institution code abkmi exists, but correct capitalization is ABKMI");
8487     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "abkmi:foo");
8488     eval = validator.Validate(seh, options);
8489     CheckErrors (*eval, expected_errors);
8490     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "");
8491 
8492     CLEAR_ERRORS
8493 
8494     // should be ok
8495     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_specimen_voucher, "CCS2009-043");
8496     //AddChromosomeNoLocation(expected_errors, entry);
8497     eval = validator.Validate(seh, options);
8498     CheckErrors (*eval, expected_errors);
8499 
8500     CLEAR_ERRORS
8501 }
8502 
8503 
// A known institution code followed by the collection "bar" must raise a
// BadCollectionCode warning for each voucher-style qualifier; the collection
// "DNA" on a bio_material value must be accepted without errors.
BOOST_AUTO_TEST_CASE(Test_Descr_BadCollectionCode)
{
    struct SBadCollectionCase {
        COrgMod::TSubtype subtype;
        const char*       value;
        const char*       message;
    };
    static const SBadCollectionCase kBadCollectionCases[] = {
        { COrgMod::eSubtype_bio_material, "ABRC:bar:foo",
          "Institution code ABRC exists, but collection ABRC:bar is not in list" },
        { COrgMod::eSubtype_specimen_voucher, "A:bar:foo",
          "Institution code A exists, but collection A:bar is not in list" },
        { COrgMod::eSubtype_culture_collection, "ABKMI:bar:foo",
          "Institution code ABKMI exists, but collection ABKMI:bar is not in list" }
    };
    const size_t kNumBadCollectionCases =
        sizeof(kBadCollectionCases) / sizeof(kBadCollectionCases[0]);

    // prepare entry
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    // Created with the first case's message; the message is then set per case.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCollectionCode",
                              kBadCollectionCases[0].message));
    //AddChromosomeNoLocation(expected_errors, entry);

    for (size_t case_idx = 0; case_idx < kNumBadCollectionCases; ++case_idx) {
        const SBadCollectionCase& bad_case = kBadCollectionCases[case_idx];

        expected_errors[0]->SetErrMsg(bad_case.message);
        unit_test_util::SetOrgMod(entry, bad_case.subtype, bad_case.value);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
        unit_test_util::SetOrgMod(entry, bad_case.subtype, "");
    }

    CLEAR_ERRORS

    // DNA is ok for biomaterial
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "ABRC:DNA:foo");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "");

    CLEAR_ERRORS
}
8542 
8543 
// A voucher value that ends right after the institution code's colon must be
// reported as IncorrectlyFormattedVoucherID for each voucher-style qualifier.
BOOST_AUTO_TEST_CASE(Test_Descr_IncorrectlyFormattedVoucherID)
{
    struct SMissingIdCase {
        COrgMod::TSubtype subtype;
        const char*       value;
    };
    static const SMissingIdCase kMissingIdCases[] = {
        { COrgMod::eSubtype_bio_material,       "ABRC:"  },
        { COrgMod::eSubtype_specimen_voucher,   "AAPI:"  },
        { COrgMod::eSubtype_culture_collection, "ABKMI:" }
    };
    const size_t kNumMissingIdCases = sizeof(kMissingIdCases) / sizeof(kMissingIdCases[0]);

    // prepare entry
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    // the same warning is expected for every case
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IncorrectlyFormattedVoucherID",
                              "Voucher is missing specific identifier"));
    //AddChromosomeNoLocation(expected_errors, entry);

    for (size_t case_idx = 0; case_idx < kNumMissingIdCases; ++case_idx) {
        const SMissingIdCase& id_case = kMissingIdCases[case_idx];

        unit_test_util::SetOrgMod(entry, id_case.subtype, id_case.value);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
        unit_test_util::SetOrgMod(entry, id_case.subtype, "");
    }

    CLEAR_ERRORS
}
8571 
8572 
// A culture_collection value without any colon-separated structure ("ABKMI")
// must be reported as an UnstructuredVoucher error.
BOOST_AUTO_TEST_CASE(Test_Descr_UnstructuredVoucher)
{
    const string kExpectedMsg = "Culture_collection should be structured, but is not";

    // prepare entry
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "ABKMI");

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "UnstructuredVoucher", kExpectedMsg));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // reset the qualifier value to "" once the check is done
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_culture_collection, "");

    CLEAR_ERRORS
}
8590 
8591 
// Setting the BioSource genome to "chromosome" must produce the Info-level
// ChromosomeLocation message.
BOOST_AUTO_TEST_CASE(Test_Descr_ChromosomeLocation)
{
    const string kExpectedMsg = "INDEXER_ONLY - BioSource location is chromosome";

    // prepare entry
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    unit_test_util::SetGenome(entry, CBioSource::eGenome_chromosome);

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Info, "ChromosomeLocation", kExpectedMsg));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
8607 
8608 
// Setting the same SubSource qualifier twice must raise a
// MultipleSourceQualifiers warning naming that qualifier. Checked for country,
// lat_lon, and then all four PCR primer qualifiers at once.
BOOST_AUTO_TEST_CASE(Test_Descr_MultipleSourceQualifiers)
{
    // Primer qualifiers, each set twice, with the warning expected for each.
    struct SPrimerQualCase {
        CSubSource::TSubtype subtype;
        const char*          first_value;
        const char*          second_value;
        const char*          message;
    };
    static const SPrimerQualCase kPrimerCases[] = {
        { CSubSource::eSubtype_fwd_primer_seq,  "AATTGGCC", "CCTTAAAA",
          "Multiple fwd_primer_seq qualifiers present" },
        { CSubSource::eSubtype_rev_primer_seq,  "AATTGGCC", "CCTTAAAA",
          "Multiple rev_primer_seq qualifiers present" },
        { CSubSource::eSubtype_fwd_primer_name, "fwd1",     "fwd2",
          "Multiple fwd_primer_name qualifiers present" },
        { CSubSource::eSubtype_rev_primer_name, "rev1",     "rev2",
          "Multiple rev_primer_name qualifiers present" }
    };
    const size_t kNumPrimerCases = sizeof(kPrimerCases) / sizeof(kPrimerCases[0]);

    // prepare entry
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    STANDARD_SETUP

    // two country qualifiers
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers",
                              "Multiple country qualifiers present"));
    //AddChromosomeNoLocation(expected_errors, entry);
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "USA");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "Zimbabwe");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "");

    // two lat_lon qualifiers
    expected_errors[0]->SetErrMsg("Multiple lat_lon qualifiers present");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "35 N 50 W");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "50 N 35 W");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "");

    // all primer qualifiers doubled: the existing expectation takes the first
    // message and one more expectation is appended for each remaining case
    expected_errors[0]->SetErrMsg(kPrimerCases[0].message);
    for (size_t case_idx = 1; case_idx < kNumPrimerCases; ++case_idx) {
        expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultipleSourceQualifiers",
                                  kPrimerCases[case_idx].message));
    }
    for (size_t case_idx = 0; case_idx < kNumPrimerCases; ++case_idx) {
        const SPrimerQualCase& primer = kPrimerCases[case_idx];
        unit_test_util::SetSubSource(entry, primer.subtype, primer.first_value);
        unit_test_util::SetSubSource(entry, primer.subtype, primer.second_value);
    }
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
8649 
8650 
SubSourceHasOtherRules(CSubSource::TSubtype subtype)8651 static bool SubSourceHasOtherRules (CSubSource::TSubtype subtype)
8652 {
8653     if (subtype == CSubSource::eSubtype_sex
8654         || subtype == CSubSource::eSubtype_frequency
8655         || subtype == CSubSource::eSubtype_plasmid_name
8656         || subtype == CSubSource::eSubtype_transposon_name
8657         || subtype == CSubSource::eSubtype_insertion_seq_name
8658         || subtype == CSubSource::eSubtype_plastid_name
8659         || subtype == CSubSource::eSubtype_country
8660         || subtype == CSubSource::eSubtype_lat_lon
8661         || subtype == CSubSource::eSubtype_collection_date
8662         || subtype == CSubSource::eSubtype_fwd_primer_name
8663         || subtype == CSubSource::eSubtype_fwd_primer_seq
8664         || subtype == CSubSource::eSubtype_rev_primer_name
8665         || subtype == CSubSource::eSubtype_rev_primer_seq
8666         || subtype == CSubSource::eSubtype_country) {
8667         return true;
8668     } else {
8669         return false;
8670     }
8671 }
8672 
8673 
OrgModHasOtherRules(COrgMod::TSubtype subtype)8674 static bool OrgModHasOtherRules (COrgMod::TSubtype subtype)
8675 {
8676     if (subtype == COrgMod::eSubtype_variety
8677         || subtype == COrgMod::eSubtype_sub_species
8678         || subtype == COrgMod::eSubtype_forma
8679         || subtype == COrgMod::eSubtype_forma_specialis
8680         || subtype == COrgMod::eSubtype_culture_collection
8681         || subtype == COrgMod::eSubtype_bio_material
8682         || subtype == COrgMod::eSubtype_specimen_voucher
8683         || subtype == COrgMod::eSubtype_metagenome_source) {
8684         return true;
8685     } else {
8686         return false;
8687     }
8688 }
8689 
8690 
CheckUnbalancedParenthesesSubSource(CSubSource::TSubtype subtype,const string & val)8691 void CheckUnbalancedParenthesesSubSource(CSubSource::TSubtype subtype, const string& val)
8692 {
8693     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
8694     unit_test_util::SetSubSource(entry, subtype, "");
8695     unit_test_util::SetSubSource(entry, subtype, val);
8696 
8697     STANDARD_SETUP
8698 
8699     if (subtype == CSubSource::eSubtype_segment) {
8700         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
8701                 "Non-viral source feature should not have a segment qualifier"));
8702     }
8703 
8704     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8705             "Unbalanced parentheses in subsource '" + val + "'"));
8706     eval = validator.Validate(seh, options);
8707     CheckErrors(*eval, expected_errors);
8708     CLEAR_ERRORS
8709 }
8710 
8711 
CheckUnbalancedParenthesesOrgMod(COrgMod::TSubtype subtype,const string & val)8712 void CheckUnbalancedParenthesesOrgMod(COrgMod::TSubtype subtype, const string& val)
8713 {
8714     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
8715     unit_test_util::SetOrgMod(entry, subtype, "");
8716     unit_test_util::SetOrgMod(entry, subtype, val);
8717 
8718     STANDARD_SETUP
8719 
8720     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8721             "Unbalanced parentheses in orgmod '" + val + "'"));
8722     //AddChromosomeNoLocation(expected_errors, entry);
8723 
8724     eval = validator.Validate(seh, options);
8725     CheckErrors(*eval, expected_errors);
8726     CLEAR_ERRORS
8727 }
8728 
8729 
BOOST_AUTO_TEST_CASE(Test_Descr_UnbalancedParentheses)8730 BOOST_AUTO_TEST_CASE(Test_Descr_UnbalancedParentheses)
8731 {
8732     // prepare entry
8733     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
8734 
8735     STANDARD_SETUP
8736 
8737     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8738                               "Unbalanced parentheses in taxname 'Malio malefi (abc'"));
8739     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
8740                               "Organism not found in taxonomy database"));
8741     //AddChromosomeNoLocation(expected_errors, entry);
8742     unit_test_util::SetTaxname(entry, "Malio malefi (abc");
8743     eval = validator.Validate(seh, options);
8744     CheckErrors (*eval, expected_errors);
8745 
8746     expected_errors[0]->SetErrMsg("Unbalanced parentheses in taxname 'Malio malefi )abc'");
8747     unit_test_util::SetTaxname(entry, "Malio malefi )abc");
8748     eval = validator.Validate(seh, options);
8749     CheckErrors (*eval, expected_errors);
8750     unit_test_util::SetSebaea_microphylla(entry);
8751 
8752     CLEAR_ERRORS
8753 
8754     for (CSubSource::TSubtype subtype = CSubSource::eSubtype_chromosome;
8755          subtype <= CSubSource::eSubtype_haplogroup;
8756          subtype++) {
8757         if (subtype != CSubSource::eSubtype_germline
8758             && subtype != CSubSource::eSubtype_rearranged
8759             && subtype != CSubSource::eSubtype_transgenic
8760             && subtype != CSubSource::eSubtype_environmental_sample
8761             && subtype != CSubSource::eSubtype_metagenomic) {
8762             if (SubSourceHasOtherRules(subtype)) {
8763                 continue;
8764             }
8765             CheckUnbalancedParenthesesSubSource(subtype, "no left (abc");
8766             CheckUnbalancedParenthesesSubSource(subtype, "no right )abc");
8767             CheckUnbalancedParenthesesSubSource(subtype, "no left ( parentheses");
8768             CheckUnbalancedParenthesesSubSource(subtype, "no right ) parentheses");
8769         }
8770     }
8771     // also check other
8772     CheckUnbalancedParenthesesSubSource(CSubSource::eSubtype_other, "no left (abc");
8773     CheckUnbalancedParenthesesSubSource(CSubSource::eSubtype_other, "no right )abc");
8774     CheckUnbalancedParenthesesSubSource(CSubSource::eSubtype_other, "no left ( parentheses");
8775     CheckUnbalancedParenthesesSubSource(CSubSource::eSubtype_other, "no right ) parentheses");
8776 
8777     for (COrgMod::TSubtype subtype = COrgMod::eSubtype_strain;
8778          subtype <= COrgMod::eSubtype_metagenome_source;
8779          subtype++) {
8780         if (OrgModHasOtherRules(subtype)) {
8781             continue;
8782         }
8783         CheckUnbalancedParenthesesOrgMod(subtype, "no left (abc");
8784         CheckUnbalancedParenthesesOrgMod(subtype, "no right )abc");
8785     }
8786     // also check old_lineage and other
8787     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UnbalancedParentheses",
8788         "Unbalanced parentheses in taxname 'Malio malefi (abc'"));
8789     //AddChromosomeNoLocation(expected_errors, entry);
8790 
8791     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_old_lineage, "no left (abc");
8792     expected_errors[0]->SetErrMsg("Unbalanced parentheses in orgmod 'no left (abc'");
8793     eval = validator.Validate(seh, options);
8794     CheckErrors (*eval, expected_errors);
8795     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_old_lineage, "");
8796     expected_errors[0]->SetErrMsg("Unbalanced parentheses in orgmod 'no right )abc'");
8797     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_old_lineage, "no right )abc");
8798     eval = validator.Validate(seh, options);
8799     CheckErrors (*eval, expected_errors);
8800     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_old_lineage, "");
8801 
8802     CheckUnbalancedParenthesesOrgMod(COrgMod::eSubtype_other, "no left (abc");
8803     CheckUnbalancedParenthesesOrgMod(COrgMod::eSubtype_other, "no right )abc");
8804 
8805     CLEAR_ERRORS
8806     // should get no error for unbalanced parentheses in old name
8807     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_old_name, "no left (abc");
8808     //AddChromosomeNoLocation(expected_errors, entry);
8809     eval = validator.Validate(seh, options);
8810     CheckErrors (*eval, expected_errors);
8811 
8812     CLEAR_ERRORS
8813 }
8814 
8815 
// Checks the "IdenticalInstitutionCode" warning for pairs of bio_material
// vouchers: two scenarios validated with no expected errors, then two
// scenarios that each expect one warning.
BOOST_AUTO_TEST_CASE(Test_Descr_IdenticalInstitutionCode)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    // Passes one bio_material value through to unit_test_util::SetOrgMod.
    const auto set_bio_material = [&](const char* value) {
        unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, value);
    };
    // Validates the entry and compares against expected_errors.
    const auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Different institutions: validated with an empty expected list.
    set_bio_material("ABRC:foo");
    set_bio_material("AGRITEC:foo");
    validate_and_check();

    // Same institution with collection "DNA": still no expected errors.
    // (The "" call presumably removes the earlier bio_material values -- confirm.)
    set_bio_material("");
    set_bio_material("ABRC:DNA:foo");
    set_bio_material("ABRC:DNA:bar");
    validate_and_check();

    // Same institution AND same collection: one warning expected.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "IdenticalInstitutionCode",
        "Multiple vouchers with same institution:collection"));
    set_bio_material("");
    set_bio_material("USDA:CFRA:foo");
    set_bio_material("USDA:CFRA:bar");
    validate_and_check();

    // Same institution, different collections: the warning text changes.
    expected_errors[0]->SetErrMsg("Multiple vouchers with same institution");
    set_bio_material("");
    set_bio_material("USDA:CFRA:foo");
    set_bio_material("USDA:GRIN:foo");
    validate_and_check();

    CLEAR_ERRORS
}
8855 
8856 
// Expects a "BadCountryCapitalization" warning when the country SubSource is
// set to the all-lowercase value "saint pierre and miquelon".
BOOST_AUTO_TEST_CASE(Test_Descr_BadCountryCapitalization)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    // Apply the badly-capitalized country first, then register the expectation.
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country,
                                 "saint pierre and miquelon");
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadCountryCapitalization",
        "Bad country capitalization [saint pierre and miquelon]"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
8873 
8874 
// Checks the "WrongVoucherType" info message: each institution code is placed
// in two OrgMod subtypes in turn and the same message is expected for both.
BOOST_AUTO_TEST_CASE(Test_Descr_WrongVoucherType)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    // Validates the entry and compares against expected_errors.
    const auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // ABRC is expected to belong in bio_material.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "WrongVoucherType",
        "Institution code ABRC should be bio_material"));
    for (const auto subtype : { COrgMod::eSubtype_culture_collection,
                                COrgMod::eSubtype_specimen_voucher }) {
        unit_test_util::SetOrgMod(entry, subtype, "ABRC:foo");
        validate_and_check();
        // Reset the subtype with "" before moving on to the next one.
        unit_test_util::SetOrgMod(entry, subtype, "");
    }

    // ABKMI is expected to belong in culture_collection.
    expected_errors[0]->SetErrMsg("Institution code ABKMI should be culture_collection");
    for (const auto subtype : { COrgMod::eSubtype_bio_material,
                                COrgMod::eSubtype_specimen_voucher }) {
        unit_test_util::SetOrgMod(entry, subtype, "ABKMI:foo");
        validate_and_check();
        unit_test_util::SetOrgMod(entry, subtype, "");
    }

    // AA is expected to belong in specimen_voucher.
    expected_errors[0]->SetErrMsg("Institution code AA should be specimen_voucher");
    for (const auto subtype : { COrgMod::eSubtype_bio_material,
                                COrgMod::eSubtype_culture_collection }) {
        unit_test_util::SetOrgMod(entry, subtype, "AA:foo");
        validate_and_check();
        unit_test_util::SetOrgMod(entry, subtype, "");
    }

    CLEAR_ERRORS
}
8916 
8917 
// Expects a "TitleHasPMID" warning for a title that contains "(PMID 1)".
BOOST_AUTO_TEST_CASE(Test_Descr_TitleHasPMID)
{
    // The title must be in place before STANDARD_SETUP runs, as in every
    // other test that prepares its entry up front.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    SetTitle(entry, "foo bar something something (PMID 1)");

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    CExpectedError* const pmid_warning = new CExpectedError(
        "lcl|good", eDiag_Warning, "TitleHasPMID",
        "Title descriptor has internal PMID");
    expected_errors.push_back(pmid_warning);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
8934 
8935 
// Checks both directions of the BARCODE keyword / barcode technique pairing:
// keyword without technique, then technique without keyword.
BOOST_AUTO_TEST_CASE(Test_Descr_BadKeyword)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Append a GenBank descriptor carrying only the BARCODE keyword.
    CRef<CSeqdesc> keyword_desc(new CSeqdesc());
    keyword_desc->SetGenbank().SetKeywords().push_back("BARCODE");
    entry->SetSeq().SetDescr().Set().push_back(keyword_desc);

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    // Validates the entry and compares against expected_errors.
    const auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Keyword present, technique not set to barcode.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadKeywordNoTechnique",
        "BARCODE keyword without Molinfo.tech barcode"));
    validate_and_check();
    CLEAR_ERRORS

    // Drop the last descriptor (the keyword descriptor pushed above) and set
    // the technique to barcode instead.
    entry->SetSeq().SetDescr().Set().pop_back();
    SetTech(entry, CMolInfo::eTech_barcode);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "NoKeywordHasTechnique",
        "Molinfo.tech barcode without BARCODE keyword"));
    validate_and_check();

    CLEAR_ERRORS
}
8963 
8964 
// Checks the "NoOrganismInTitle" error for a RefSeq nucleotide record
// (NC_ accession) and then for a RefSeq protein (NP_ accession) inside a
// nuc-prot set.
BOOST_AUTO_TEST_CASE(Test_Descr_NoOrganismInTitle)
{
    // --- nucleotide: title does not start with the organism name ---
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123458");
    SetTitle(entry, "Something that does not start with organism");

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide scope, validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    // Validates whatever seh currently refers to and compares with expected_errors.
    const auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError(
        "ref|NC_123458|", eDiag_Error, "NoOrganismInTitle",
        "RefSeq nucleotide title does not start with organism name"));
    validate_and_check();

    CLEAR_ERRORS

    // --- protein: swap the scope over to a freshly built nuc-prot set ---
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();

    CRef<CSeq_id> refseq_prot_id(new CSeq_id());
    refseq_prot_id->SetOther().SetAccession("NP_123456");
    unit_test_util::ChangeProtId(entry, refseq_prot_id);

    // The title goes on the last member of the set.
    SetTitle(entry->SetSet().SetSeq_set().back(),
             "Something that does not end with organism");
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "ref|NP_123456|", eDiag_Error, "NoOrganismInTitle",
        "RefSeq protein title does not end with organism name"));
    expected_errors.push_back(new CExpectedError(
        "ref|NP_123456|", eDiag_Warning, "InconsistentProteinTitle",
        "Instantiated protein title does not match automatically generated title"));
    validate_and_check();

    CLEAR_ERRORS
}
9000 
9001 
BOOST_AUTO_TEST_CASE(Test_Descr_MissingChromosome)9002 BOOST_AUTO_TEST_CASE(Test_Descr_MissingChromosome)
9003 {
9004     // prepare entry
9005     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
9006     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
9007     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_chromosome, "");
9008 
9009     STANDARD_SETUP
9010 
9011     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "MissingChromosome",
9012                               "Missing chromosome qualifier on NC or AC RefSeq record"));
9013     eval = validator.Validate(seh, options);
9014     CheckErrors (*eval, expected_errors);
9015 
9016     CLEAR_ERRORS
9017 
9018     // error is suppressed if prokaryote or organelle
9019     unit_test_util::SetLineage (entry, "Viruses; foo");
9020     eval = validator.Validate(seh, options);
9021     CheckErrors (*eval, expected_errors);
9022     unit_test_util::SetLineage (entry, "Bacteria; foo");
9023     eval = validator.Validate(seh, options);
9024     CheckErrors (*eval, expected_errors);
9025     CLEAR_ERRORS
9026     unit_test_util::SetLineage (entry, "Archaea; foo");
9027     eval = validator.Validate(seh, options);
9028     CheckErrors (*eval, expected_errors);
9029     unit_test_util::SetLineage (entry, "some lineage");
9030     unit_test_util::SetDiv(entry, "BCT");
9031     eval = validator.Validate(seh, options);
9032     CheckErrors (*eval, expected_errors);
9033     unit_test_util::SetDiv(entry, "VRL");
9034     eval = validator.Validate(seh, options);
9035     CheckErrors (*eval, expected_errors);
9036     unit_test_util::SetDiv(entry, "");
9037 
9038     // error is suppressed if linkage group
9039     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_linkage_group, "x");
9040     eval = validator.Validate(seh, options);
9041     CheckErrors(*eval, expected_errors);
9042     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_linkage_group, "");
9043 
9044     // error is suppressed if organelle
9045     unit_test_util::SetGenome (entry, CBioSource::eGenome_chloroplast);
9046     eval = validator.Validate(seh, options);
9047     CheckErrors (*eval, expected_errors);
9048     unit_test_util::SetGenome (entry, CBioSource::eGenome_chromoplast);
9049     eval = validator.Validate(seh, options);
9050     CheckErrors (*eval, expected_errors);
9051     unit_test_util::SetGenome (entry, CBioSource::eGenome_kinetoplast);
9052     unit_test_util::SetLineage(entry, "some lineage; Kinetoplastida");
9053     eval = validator.Validate(seh, options);
9054     CheckErrors (*eval, expected_errors);
9055     unit_test_util::SetGenome (entry, CBioSource::eGenome_mitochondrion);
9056     eval = validator.Validate(seh, options);
9057     CheckErrors (*eval, expected_errors);
9058     unit_test_util::SetGenome (entry, CBioSource::eGenome_cyanelle);
9059     eval = validator.Validate(seh, options);
9060     CheckErrors (*eval, expected_errors);
9061     unit_test_util::SetGenome (entry, CBioSource::eGenome_nucleomorph);
9062     unit_test_util::SetTaxname(entry, "Bigelowiella natans");
9063     unit_test_util::SetTaxon(entry, 0);
9064     unit_test_util::SetTaxon(entry, 227086);
9065     unit_test_util::SetLineage(entry, "some lineage; Chlorarachniophyceae");
9066     eval = validator.Validate(seh, options);
9067     CheckErrors (*eval, expected_errors);
9068     CLEAR_ERRORS
9069 
9070     unit_test_util::SetGenome (entry, CBioSource::eGenome_apicoplast);
9071     eval = validator.Validate(seh, options);
9072     CheckErrors (*eval, expected_errors);
9073     unit_test_util::SetGenome (entry, CBioSource::eGenome_leucoplast);
9074     eval = validator.Validate(seh, options);
9075     CheckErrors (*eval, expected_errors);
9076     unit_test_util::SetGenome (entry, CBioSource::eGenome_proplastid);
9077     eval = validator.Validate(seh, options);
9078     CheckErrors (*eval, expected_errors);
9079     unit_test_util::SetGenome (entry, CBioSource::eGenome_hydrogenosome);
9080     eval = validator.Validate(seh, options);
9081     CheckErrors (*eval, expected_errors);
9082 
9083 }
9084 
9085 
BOOST_AUTO_TEST_CASE(Test_Descr_BadStructuredCommentFormat)9086 BOOST_AUTO_TEST_CASE(Test_Descr_BadStructuredCommentFormat)
9087 {
9088     // prepare entry
9089     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
9090     CRef<CSeqdesc> desc(new CSeqdesc());
9091     desc->SetUser().SetType().SetStr("StructuredComment");
9092     entry->SetSeq().SetDescr().Set().push_back(desc);
9093 
9094     STANDARD_SETUP
9095 
9096     // no prefix only empty errors
9097     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrucCommMissingUserObject",
9098                                                  "Structured Comment user object descriptor is empty"));
9099     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "UserObjectNoData",
9100                                                  "User object with no data"));
9101     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "StrucCommMissingPrefixOrSuffix",
9102                                                  "Structured Comment lacks prefix and/or suffix"));
9103     //AddChromosomeNoLocation(expected_errors, entry);
9104     eval = validator.Validate(seh, options);
9105     CheckErrors (*eval, expected_errors);
9106 
9107     CLEAR_ERRORS
9108 
9109     // unrecognized prefix
9110     CRef<CUser_field> prefix_field(new CUser_field());
9111     prefix_field->SetLabel().SetStr("StructuredCommentPrefix");
9112     prefix_field->SetData().SetStr("Unknown prefix");
9113     desc->SetUser().SetData().push_back(prefix_field);
9114     eval = validator.Validate(seh, options);
9115     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadStrucCommInvalidPrefix",
9116                                     "Unknown prefix is not a valid value for StructuredCommentPrefix"));
9117     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9118     //AddChromosomeNoLocation(expected_errors, entry);
9119     CheckErrors (*eval, expected_errors);
9120 
9121     CLEAR_ERRORS
9122 
9123     // should complain about missing required fields
9124     prefix_field->SetData().SetStr("##Genome-Assembly-Data-START##");
9125     vector<string> required_fields;
9126     /*
9127     required_fields.push_back("Finishing Goal");
9128     required_fields.push_back("Current Finishing Status");
9129     */
9130     required_fields.push_back("Assembly Method");
9131     required_fields.push_back("Genome Coverage");
9132     required_fields.push_back("Sequencing Technology");
9133 
9134     EDiagSev levels[] = { eDiag_Warning, eDiag_Warning, eDiag_Warning, eDiag_Warning, eDiag_Warning };
9135 
9136     int i = 0;
9137     ITERATE(vector<string>, it, required_fields) {
9138         expected_errors.push_back(new CExpectedError("lcl|good", levels[i], "BadStrucCommMissingField",
9139                                   "Required field " + *it + " is missing"));
9140         i++;
9141     }
9142     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9143     //AddChromosomeNoLocation(expected_errors, entry);
9144 
9145     eval = validator.Validate(seh, options);
9146     CheckErrors (*eval, expected_errors);
9147 
9148     CLEAR_ERRORS
9149 
9150     // add fields in wrong order, with bad values where appropriate
9151     const vector<string>& const_required_fields = required_fields;
9152     REVERSE_ITERATE(vector<string>, it, const_required_fields) {
9153         CRef<CUser_field> field(new CUser_field());
9154         field->SetLabel().SetStr(*it);
9155         field->SetData().SetStr("bad value");
9156         desc->SetUser().SetData().push_back(field);
9157     }
9158 
9159     size_t pos = 0;
9160     ITERATE(vector<string>, it, required_fields) {
9161         if (pos < required_fields.size() - 1) {
9162             expected_errors.push_back(new CExpectedError("lcl|good", levels[pos], "BadStrucCommFieldOutOfOrder",
9163                                       *it + " field is out of order"));
9164         }
9165         if (!NStr::Equal(*it, "Genome Coverage") && !NStr::Equal(*it, "Sequencing Technology")) {
9166             expected_errors.push_back(new CExpectedError("lcl|good", levels[pos], "BadStrucCommInvalidFieldValue",
9167                                       "bad value is not a valid value for " + *it));
9168         }
9169         ++pos;
9170     }
9171     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9172     //AddChromosomeNoLocation(expected_errors, entry);
9173 
9174     eval = validator.Validate(seh, options);
9175     CheckErrors (*eval, expected_errors);
9176 
9177     CLEAR_ERRORS
9178 
9179     prefix_field->SetData().SetStr("##MIGS-Data-START##");
9180     required_fields.clear();
9181     required_fields.push_back("alt_elev");
9182     required_fields.push_back("assembly");
9183     required_fields.push_back("collection_date");
9184     required_fields.push_back("country");
9185     required_fields.push_back("depth");
9186     required_fields.push_back("environment");
9187     required_fields.push_back("investigation_type");
9188     required_fields.push_back("isol_growth_condt");
9189     required_fields.push_back("lat_lon");
9190     required_fields.push_back("project_name");
9191     required_fields.push_back("sequencing_meth");
9192 
9193     ITERATE(vector<string>, it, required_fields) {
9194         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
9195                                   "Required field " + *it + " is missing"));
9196     }
9197     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9198     //AddChromosomeNoLocation(expected_errors, entry);
9199 
9200     eval = validator.Validate(seh, options);
9201     CheckErrors (*eval, expected_errors);
9202 
9203     CLEAR_ERRORS
9204 
9205     prefix_field->SetData().SetStr("##MIGS:4.0-Data-START##");
9206     required_fields.clear();
9207     required_fields.push_back("assembly");
9208     required_fields.push_back("collection_date");
9209     required_fields.push_back("env_biome");
9210     required_fields.push_back("env_feature");
9211     required_fields.push_back("env_material");
9212     required_fields.push_back("env_package");
9213     required_fields.push_back("geo_loc_name");
9214     required_fields.push_back("investigation_type");
9215     required_fields.push_back("isol_growth_condt");
9216     required_fields.push_back("lat_lon");
9217     required_fields.push_back("project_name");
9218     required_fields.push_back("seq_meth");
9219 
9220     ITERATE(vector<string>, it, required_fields) {
9221         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
9222                                   "Required field " + *it + " is missing"));
9223     }
9224     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9225     //AddChromosomeNoLocation(expected_errors, entry);
9226 
9227     eval = validator.Validate(seh, options);
9228     CheckErrors (*eval, expected_errors);
9229 
9230     CLEAR_ERRORS
9231 
9232     // should complain about missing required field for specific values of sequencing technology
9233     prefix_field->SetData().SetStr("##Assembly-Data-START##");
9234     desc->SetUser().ResetData();
9235     desc->SetUser().SetData().push_back(prefix_field);
9236 
9237     CRef<CUser_field> field(new CUser_field());
9238     field->SetLabel().SetStr("Sequencing Technology");
9239     field->SetData().SetStr("Singer");
9240     desc->SetUser().SetData().push_back(field);
9241 
9242     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9243                                   "Required field Assembly Method is missing when Sequencing Technology has value 'Singer'"));
9244     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9245     //AddChromosomeNoLocation(expected_errors, entry);
9246 
9247     eval = validator.Validate(seh, options);
9248     CheckErrors (*eval, expected_errors);
9249 
9250     CLEAR_ERRORS
9251 
9252     field->SetData().SetStr("something else");
9253     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9254                                   "Required field Assembly Method is missing when Sequencing Technology has value 'something else'"));
9255     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9256     //AddChromosomeNoLocation(expected_errors, entry);
9257 
9258     eval = validator.Validate(seh, options);
9259     CheckErrors (*eval, expected_errors);
9260 
9261     CLEAR_ERRORS
9262 
9263     prefix_field->SetData().SetStr("##HumanSTR-START##");
9264 
9265     eval = validator.Validate(seh, options);
9266 
9267     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9268         "Required field STR locus name is missing"));
9269     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9270         "Required field Length-based allele is missing"));
9271     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadStrucCommMissingField",
9272         "Required field Bracketed repeat is missing"));
9273     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9274     //AddChromosomeNoLocation(expected_errors, entry);
9275     CheckErrors(*eval, expected_errors);
9276 
9277     CLEAR_ERRORS
9278 }
9279 
9280 
MkField(const string & label,const string & val)9281 CRef<CUser_field> MkField(const string& label, const string& val)
9282 {
9283     CRef<CUser_field> f(new CUser_field());
9284     f->SetLabel().SetStr(label);
9285     f->SetData().SetStr(val);
9286     return f;
9287 }
9288 
9289 
// Expects a "BadAssemblyName" info message for a Genome-Assembly-Data
// structured comment whose Assembly Name is "NCBI1234".
BOOST_AUTO_TEST_CASE(Test_VR_709)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Label/value pairs of the structured comment, in insertion order.
    static const char* const kCommentFields[][2] = {
        { "StructuredCommentPrefix", "##Genome-Assembly-Data-START##" },
        { "Assembly Method",         "a v. b" },
        { "Assembly Name",           "NCBI1234" },
        { "Genome Coverage",         "1" },
        { "Sequencing Technology",   "2" }
    };

    CRef<CUser_object> user(new CUser_object());
    user->SetType().SetStr("StructuredComment");
    for (const auto& label_and_value : kCommentFields) {
        user->SetData().push_back(MkField(label_and_value[0], label_and_value[1]));
    }

    // Assign() is used here, so the descriptor does not share `user` itself.
    CRef<CSeqdesc> desc(new CSeqdesc());
    desc->SetUser().Assign(*user);
    entry->SetSeq().SetDescr().Set().push_back(desc);

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "BadAssemblyName",
        "Assembly Name should not start with 'NCBI' or 'GenBank' in structured comment"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue",
        "Structured Comment invalid; the field value and/or name are incorrect"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
9319 
9320 
// Expects "BioSourceNeedsChromosome" for a genomic, complete record titled
// "... complete genome.", then validates several variations without that
// warning, ending with the chromosome-location case.
BOOST_AUTO_TEST_CASE(Test_Descr_BioSourceNeedsChromosome)
{
    static const char* const kGenomeTitle = "Sebaea microphylla, complete genome.";

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic);
    unit_test_util::SetCompleteness(entry, CMolInfo::eCompleteness_complete);
    SetTitle(entry, kGenomeTitle);

    // STANDARD_SETUP / CLEAR_ERRORS are macros defined elsewhere; presumably they
    // provide validator, seh, options, eval and expected_errors -- confirm.
    STANDARD_SETUP

    // Validates the entry and compares against expected_errors.
    const auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Baseline: the warning is expected.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BioSourceNeedsChromosome",
        "Non-viral complete genome not labeled as chromosome"));
    validate_and_check();

    CLEAR_ERRORS

    // Variation 1: viral lineage, restored afterwards.
    unit_test_util::SetLineage(entry, "Viruses; ");
    validate_and_check();
    unit_test_util::SetLineage(entry, "some lineage");

    // Variation 2: mRNA instead of genomic.
    // NOTE(review): only the biomol is restored afterwards; the Seq-inst
    // molecule stays RNA for the remaining steps -- confirm this is intended.
    unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
    entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
    validate_and_check();
    unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_genomic);

    // Variation 3: title no longer ends with "complete genome.", restored afterwards.
    SetTitle(entry, "Sebaea microphylla, complete sequence.");
    validate_and_check();
    SetTitle(entry, kGenomeTitle);

    // Variation 4: source location is chromosome; a different, info-level
    // message is expected instead.
    CLEAR_ERRORS
    unit_test_util::SetGenome(entry, CBioSource::eGenome_chromosome);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "ChromosomeLocation",
        "INDEXER_ONLY - BioSource location is chromosome"));
    validate_and_check();

    CLEAR_ERRORS
}
9370 
9371 
BOOST_AUTO_TEST_CASE(Test_Descr_MolInfoConflictsWithBioSource)9372 BOOST_AUTO_TEST_CASE(Test_Descr_MolInfoConflictsWithBioSource)
9373 {
9374     // prepare entry
9375     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
9376     // test for single-strand RNA viruses
9377     unit_test_util::SetLineage (entry, "Viruses; Avsunviroidae; foo");
9378 
9379     STANDARD_SETUP
9380     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9381                               "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
9382     //AddChromosomeNoLocation(expected_errors, entry);
9383 
9384     eval = validator.Validate(seh, options);
9385     CheckErrors (*eval, expected_errors);
9386 
9387     unit_test_util::SetLineage (entry, "Viruses; Deltavirus; foo");
9388     eval = validator.Validate(seh, options);
9389     CheckErrors (*eval, expected_errors);
9390 
9391     unit_test_util::SetLineage (entry, "Viruses; Arenaviridae; foo");
9392     eval = validator.Validate(seh, options);
9393     CheckErrors (*eval, expected_errors);
9394 
9395     CLEAR_ERRORS
9396     // expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InconsistentVirusMoltype",
9397     //                           "Genomic DNA viral lineage indicates no DNA stage"));
9398     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9399                               "Taxonomy indicates single-stranded RNA, molecule type (DNA) is conflicting."));
9400     //AddChromosomeNoLocation(expected_errors, entry);
9401 
9402     unit_test_util::SetLineage (entry, "Viruses; Albetovirus; foo");
9403     eval = validator.Validate(seh, options);
9404     CheckErrors (*eval, expected_errors);
9405 
9406     // error should go away if mol is rna
9407     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
9408     CLEAR_ERRORS
9409     //AddChromosomeNoLocation(expected_errors, entry);
9410     eval = validator.Validate(seh, options);
9411     CheckErrors (*eval, expected_errors);
9412 
9413     // tests for double-stranded RNA viruses
9414     unit_test_util::SetLineage (entry, "Viruses; Amalgaviridae; foo");
9415     // should be no error because rna
9416     eval = validator.Validate(seh, options);
9417     CheckErrors (*eval, expected_errors);
9418     // error if not rna
9419     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
9420     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9421                               "Taxonomy indicates double-stranded RNA, molecule type (DNA) is conflicting."));
9422     eval = validator.Validate(seh, options);
9423     CheckErrors (*eval, expected_errors);
9424 
9425     // test for single-stranded DNS viruses
9426     unit_test_util::SetLineage (entry, "Viruses; Alphasatellitidae; foo");
9427     // no errors because is dna
9428     CLEAR_ERRORS
9429     //AddChromosomeNoLocation(expected_errors, entry);
9430     eval = validator.Validate(seh, options);
9431     CheckErrors (*eval, expected_errors);
9432     // error if not dna
9433     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
9434     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MolInfoConflictsWithBioSource",
9435                               "Taxonomy indicates single-stranded DNA, molecule type (RNA) is conflicting."));
9436     eval = validator.Validate(seh, options);
9437     CheckErrors (*eval, expected_errors);
9438 
9439     // test for double-stranded DNS viruses
9440     unit_test_util::SetLineage (entry, "Viruses; Hepadnaviridae; foo");
9441     // error because not dna
9442     expected_errors.back()->SetErrMsg("Taxonomy indicates double-stranded DNA, molecule type (RNA) is conflicting.");
9443     eval = validator.Validate(seh, options);
9444     CheckErrors (*eval, expected_errors);
9445     //no error if dna
9446     entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_dna);
9447     CLEAR_ERRORS
9448     //AddChromosomeNoLocation(expected_errors, entry);
9449     eval = validator.Validate(seh, options);
9450     CheckErrors (*eval, expected_errors);
9451     CLEAR_ERRORS
9452 }
9453 
9454 
BOOST_AUTO_TEST_CASE(Test_Descr_MissingKeyword)9455 BOOST_AUTO_TEST_CASE(Test_Descr_MissingKeyword)
9456 {
9457     // prepare entry
9458     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
9459     CRef<CSeqdesc> sdesc(new CSeqdesc());
9460     sdesc->SetUser().SetType().SetStr("StructuredComment");
9461     entry->SetSeq().SetDescr().Set().push_back(sdesc);
9462 
9463     sdesc->SetUser().AddField("StructuredCommentPrefix", "##MIGS-Data-START##", CUser_object::eParse_String);
9464     sdesc->SetUser().AddField("alt_elev", "foo", CUser_object::eParse_String);
9465     sdesc->SetUser().AddField("assembly", "foo", CUser_object::eParse_String);
9466     sdesc->SetUser().AddField("collection_date", "foo", CUser_object::eParse_String);
9467     sdesc->SetUser().AddField("country", "foo", CUser_object::eParse_String);
9468     sdesc->SetUser().AddField("depth", "foo", CUser_object::eParse_String);
9469     sdesc->SetUser().AddField("environment", "foo", CUser_object::eParse_String);
9470     sdesc->SetUser().AddField("investigation_type", "eukaryote", CUser_object::eParse_String);
9471     sdesc->SetUser().AddField("isol_growth_condt", "foo", CUser_object::eParse_String);
9472     sdesc->SetUser().AddField("sequencing_meth", "foo", CUser_object::eParse_String);
9473     sdesc->SetUser().AddField("project_name", "foo", CUser_object::eParse_String);
9474     sdesc->SetUser().AddField("ploidy", "foo", CUser_object::eParse_String);
9475     sdesc->SetUser().AddField("num_replicons", "foo", CUser_object::eParse_String);
9476     sdesc->SetUser().AddField("estimated_size", "foo", CUser_object::eParse_String);
9477     sdesc->SetUser().AddField("trophic_level", "foo", CUser_object::eParse_String);
9478     sdesc->SetUser().AddField("propagation", "foo", CUser_object::eParse_String);
9479     sdesc->SetUser().AddField("lat_lon", "foo", CUser_object::eParse_String);
9480 
9481     CRef<CSeqdesc> gdesc(new CSeqdesc());
9482     gdesc->SetGenbank().SetKeywords().push_back("GSC:MIGS:2.1");
9483     entry->SetSeq().SetDescr().Set().push_back(gdesc);
9484 
9485     STANDARD_SETUP
9486 
9487     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadKeywordForStrucComm",
9488                                                  "Structured Comment is non-compliant, keyword should be removed"));
9489     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommMissingField",
9490                                                  "Required field finishing_strategy is missing when investigation_type has value 'eukaryote'"));
9491     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info, "BadStrucCommInvalidFieldValue", "Structured Comment invalid; the field value and/or name are incorrect"));
9492     //AddChromosomeNoLocation(expected_errors, entry);
9493     eval = validator.Validate(seh, options);
9494     CheckErrors (*eval, expected_errors);
9495 
9496     // if no keyword, no badkeyword error
9497     entry->SetSeq().SetDescr().Set().pop_back();
9498     delete expected_errors[0];
9499     expected_errors[0] = NULL;
9500     eval = validator.Validate(seh, options);
9501     CheckErrors (*eval, expected_errors);
9502 
9503     CLEAR_ERRORS
9504 
9505     // make the comment valid, should complain about missing keyword
9506     sdesc->SetUser().AddField("finishing_strategy", "foo", CUser_object::eParse_String);
9507     //AddChromosomeNoLocation(expected_errors, entry);
9508     eval = validator.Validate(seh, options);
9509     CheckErrors (*eval, expected_errors);
9510 
9511     CLEAR_ERRORS
9512     // put keyword back, should have no errors
9513     entry->SetSeq().SetDescr().Set().push_back(gdesc);
9514     //AddChromosomeNoLocation(expected_errors, entry);
9515     eval = validator.Validate(seh, options);
9516     CheckErrors (*eval, expected_errors);
9517     CLEAR_ERRORS
9518 }
9519 
9520 
BOOST_AUTO_TEST_CASE(Test_Descr_FakeStructuredComment)
{
    // A plain comment descriptor whose text contains "::" is expected to be
    // reported with an informational FakeStructuredComment message.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeqdesc> comment_desc(new CSeqdesc());
    comment_desc->SetComment("This comment contains ::");
    entry->SetSeq().SetDescr().Set().push_back(comment_desc);

    STANDARD_SETUP

    CExpectedError* fake_comment = new CExpectedError(
        "lcl|good", eDiag_Info, "FakeStructuredComment",
        "Comment may be formatted to look like a structured comment.");
    expected_errors.push_back(fake_comment);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
9539 
9540 
BOOST_AUTO_TEST_CASE(Test_Descr_StructuredCommentPrefixOrSuffixMissing)
{
    // A structured comment that has a single ordinary field and neither a
    // prefix nor a suffix field is expected to be reported with an
    // informational StrucCommMissingPrefixOrSuffix message.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeqdesc> sdesc(new CSeqdesc());
    CUser_object& comment = sdesc->SetUser();
    comment.SetType().SetStr("StructuredComment");
    comment.AddField("OneField", "some value", CUser_object::eParse_String);
    entry->SetSeq().SetDescr().Set().push_back(sdesc);

    STANDARD_SETUP

    CExpectedError* no_prefix_or_suffix = new CExpectedError(
        "lcl|good", eDiag_Info, "StrucCommMissingPrefixOrSuffix",
        "Structured Comment lacks prefix and/or suffix");
    expected_errors.push_back(no_prefix_or_suffix);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
9559 
9560 
BOOST_AUTO_TEST_CASE(Test_Generic_NonAsciiAsn)
{
    // Validates with CValidator::eVal_non_ascii added to the option mask and
    // expects a single fatal NonAsciiAsn message, first for a lone sequence
    // and then for a nuc-prot set.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Scope, validator and expectation list are set up by hand here because
    // the option mask differs from the usual one by the extra
    // eVal_non_ascii bit.
    CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
    CScope scope(*objmgr);
    scope.AddDefaults();
    CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
    CConstRef<CValidError> eval;
    CValidator validator(*objmgr);

    unsigned int options = CValidator::eVal_need_isojta;
    options |= CValidator::eVal_far_fetch_mrna_products;
    options |= CValidator::eVal_validate_id_set;
    options |= CValidator::eVal_indexer_version;
    options |= CValidator::eVal_use_entrez;
    options |= CValidator::eVal_non_ascii;

    vector< CExpectedError *> expected_errors;

    CExpectedError* non_ascii = new CExpectedError(
        "lcl|good", eDiag_Fatal, "NonAsciiAsn",
        "Non-ascii chars in input ASN.1 strings");
    expected_errors.push_back(non_ascii);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Swap in a nuc-prot set: the same single expectation is reused, only
    // its accession is changed to the nucleotide's id.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "lcl|nuc");

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
9595 
9596 
BOOST_AUTO_TEST_CASE(Test_SEQ_DESCR_MissingPersonalCollectionName)
{
    // A bio_material org modifier of the form "personal:1234" is expected to
    // draw a MissingPersonalCollectionName warning.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_bio_material, "personal:1234");

    STANDARD_SETUP

    CExpectedError* no_collector = new CExpectedError(
        "lcl|good", eDiag_Warning, "MissingPersonalCollectionName",
        "Personal collection does not have name of collector");
    expected_errors.push_back(no_collector);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
9611 
9612 
BOOST_AUTO_TEST_CASE(Test_Generic_AuthorListHasEtAl)9613 BOOST_AUTO_TEST_CASE(Test_Generic_AuthorListHasEtAl)
9614 {
9615     // prepare entry
9616     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
9617     CRef<CAuthor> author(new CAuthor());
9618     author->SetName().SetName().SetLast("et al.");
9619     CRef<CPub> pub(new CPub());
9620     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9621     CRef<CCit_art::TTitle::C_E> art_title(new CCit_art::TTitle::C_E());
9622     art_title->SetName("article title");
9623     pub->SetArticle().SetTitle().Set().push_back(art_title);
9624     CRef<CSeqdesc> desc(new CSeqdesc());
9625     desc->SetPub().SetPub().Set().push_back(pub);
9626     entry->SetDescr().Set().push_back(desc);
9627 
9628     STANDARD_SETUP
9629 
9630     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "AuthorListHasEtAl",
9631                               "Author list ends in et al."));
9632     //AddChromosomeNoLocation(expected_errors, entry);
9633     eval = validator.Validate(seh, options);
9634     CheckErrors (*eval, expected_errors);
9635 
9636     pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9637     CRef<CCit_book::TTitle::C_E> book_title(new CCit_book::TTitle::C_E());
9638     book_title->SetName("book title");
9639     pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9640     eval = validator.Validate(seh, options);
9641     CheckErrors (*eval, expected_errors);
9642 
9643     pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9644     pub->SetBook().SetTitle().Set().push_back(book_title);
9645     eval = validator.Validate(seh, options);
9646     CheckErrors (*eval, expected_errors);
9647 
9648     pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9649     pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9650     eval = validator.Validate(seh, options);
9651     CheckErrors (*eval, expected_errors);
9652 
9653     pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9654     pub->SetGen().SetTitle("gen title");
9655     pub->SetGen().SetDate().SetStd().SetYear(2009);
9656     eval = validator.Validate(seh, options);
9657     CheckErrors (*eval, expected_errors);
9658 
9659     pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9660     pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9661 
9662     pub->SetSub().SetDate().SetStd().SetYear(2009);
9663     pub->SetSub().SetDate().SetStd().SetMonth(12);
9664     pub->SetSub().SetDate().SetStd().SetDay(31);
9665 
9666     eval = validator.Validate(seh, options);
9667     CheckErrors (*eval, expected_errors);
9668 
9669     // try as pub feature
9670     scope.RemoveTopLevelSeqEntry(seh);
9671     entry->SetDescr().Set().pop_back();
9672     CRef<CSeq_feat> feat(new CSeq_feat());
9673     feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
9674     feat->SetLocation().SetInt().SetFrom(0);
9675     feat->SetLocation().SetInt().SetTo(10);
9676     feat->SetData().SetPub().SetPub().Set().push_back(pub);
9677     CRef<CSeq_annot> annot(new CSeq_annot());
9678     annot->SetData().SetFtable().push_back(feat);
9679     entry->SetSeq().SetAnnot().push_back(annot);
9680     seh = scope.AddTopLevelSeqEntry(*entry);
9681 
9682     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9683     pub->SetArticle().SetTitle().Set().push_back(art_title);
9684     eval = validator.Validate(seh, options);
9685     CheckErrors (*eval, expected_errors);
9686 
9687     pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9688     pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9689     eval = validator.Validate(seh, options);
9690     CheckErrors (*eval, expected_errors);
9691 
9692     pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9693     pub->SetBook().SetTitle().Set().push_back(book_title);
9694     eval = validator.Validate(seh, options);
9695     CheckErrors (*eval, expected_errors);
9696 
9697     pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9698     pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9699     eval = validator.Validate(seh, options);
9700     CheckErrors (*eval, expected_errors);
9701 
9702     pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9703     pub->SetGen().SetTitle("gen title");
9704     pub->SetGen().SetDate().SetStd().SetYear(2009);
9705     eval = validator.Validate(seh, options);
9706     CheckErrors (*eval, expected_errors);
9707 
9708     pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9709     pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9710 
9711     pub->SetSub().SetDate().SetStd().SetYear(2009);
9712     pub->SetSub().SetDate().SetStd().SetMonth(12);
9713     pub->SetSub().SetDate().SetStd().SetDay(31);
9714 
9715     eval = validator.Validate(seh, options);
9716     CheckErrors (*eval, expected_errors);
9717 
9718     // look for contains instead of ends with
9719     scope.RemoveTopLevelSeqEntry(seh);
9720     entry->SetSeq().SetAnnot().pop_back();
9721     entry->SetDescr().Set().push_back(desc);
9722     seh = scope.AddTopLevelSeqEntry(*entry);
9723 
9724     expected_errors[0]->SetErrMsg("Author list contains et al.");
9725     CRef<CAuthor> author2 = unit_test_util::BuildGoodAuthor();
9726 
9727     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9728     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author2);
9729     pub->SetArticle().SetTitle().Set().push_back(art_title);
9730     eval = validator.Validate(seh, options);
9731     CheckErrors (*eval, expected_errors);
9732 
9733     pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9734     pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author2);
9735     pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9736     eval = validator.Validate(seh, options);
9737     CheckErrors (*eval, expected_errors);
9738 
9739     pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9740     pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9741     pub->SetBook().SetTitle().Set().push_back(book_title);
9742     eval = validator.Validate(seh, options);
9743     CheckErrors (*eval, expected_errors);
9744 
9745     pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9746     pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9747     pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9748     eval = validator.Validate(seh, options);
9749     CheckErrors (*eval, expected_errors);
9750 
9751     pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9752     pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author2);
9753     pub->SetGen().SetTitle("gen title");
9754     pub->SetGen().SetDate().SetStd().SetYear(2009);
9755     eval = validator.Validate(seh, options);
9756     CheckErrors (*eval, expected_errors);
9757 
9758     pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9759     pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author2);
9760     pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9761 
9762     pub->SetSub().SetDate().SetStd().SetYear(2009);
9763     pub->SetSub().SetDate().SetStd().SetMonth(12);
9764     pub->SetSub().SetDate().SetStd().SetDay(31);
9765 
9766     eval = validator.Validate(seh, options);
9767     CheckErrors (*eval, expected_errors);
9768 
9769     // try as pub feature
9770     scope.RemoveTopLevelSeqEntry(seh);
9771     entry->SetDescr().Set().pop_back();
9772     entry->SetSeq().SetAnnot().push_back(annot);
9773     seh = scope.AddTopLevelSeqEntry(*entry);
9774 
9775     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9776     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author2);
9777     pub->SetArticle().SetTitle().Set().push_back(art_title);
9778     eval = validator.Validate(seh, options);
9779     CheckErrors (*eval, expected_errors);
9780 
9781     pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9782     pub->SetMan().SetCit().SetAuthors().SetNames().SetStd().push_back(author2);
9783     pub->SetMan().SetCit().SetTitle().Set().push_back(book_title);
9784     eval = validator.Validate(seh, options);
9785     CheckErrors (*eval, expected_errors);
9786 
9787     pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9788     pub->SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9789     pub->SetBook().SetTitle().Set().push_back(book_title);
9790     eval = validator.Validate(seh, options);
9791     CheckErrors (*eval, expected_errors);
9792 
9793     pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author);
9794     pub->SetProc().SetBook().SetAuthors().SetNames().SetStd().push_back(author2);
9795     pub->SetProc().SetBook().SetTitle().Set().push_back(book_title);
9796     eval = validator.Validate(seh, options);
9797     CheckErrors (*eval, expected_errors);
9798 
9799     pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9800     pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author2);
9801     pub->SetGen().SetTitle("gen title");
9802     pub->SetGen().SetDate().SetStd().SetYear(2009);
9803     eval = validator.Validate(seh, options);
9804     CheckErrors (*eval, expected_errors);
9805 
9806     pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9807     pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author2);
9808     pub->SetSub().SetAuthors().SetAffil().SetStr("some affiliation");
9809 
9810     pub->SetSub().SetDate().SetStd().SetYear(2009);
9811     pub->SetSub().SetDate().SetStd().SetMonth(12);
9812     pub->SetSub().SetDate().SetStd().SetDay(31);
9813 
9814     eval = validator.Validate(seh, options);
9815     CheckErrors (*eval, expected_errors);
9816 
9817     CLEAR_ERRORS
9818 }
9819 
9820 
BOOST_AUTO_TEST_CASE(Test_Generic_MissingPubRequirement)9821 BOOST_AUTO_TEST_CASE(Test_Generic_MissingPubRequirement)
9822 {
9823     // validate cit-sub
9824     CRef<CSeq_submit> submit(new CSeq_submit());
9825 
9826     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
9827     submit->SetData().SetEntrys().push_back(entry);
9828     CRef<CAuthor> author = unit_test_util::BuildGoodAuthor();
9829     submit->SetSub().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
9830     submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
9831 
9832     submit->SetSub().SetCit().SetDate().SetStd().SetYear(2009);
9833     submit->SetSub().SetCit().SetDate().SetStd().SetMonth(12);
9834     submit->SetSub().SetCit().SetDate().SetStd().SetDay(31);
9835 
9836     STANDARD_SETUP
9837 
9838     vector<string> ids;
9839     ids.push_back("good");
9840     ids.push_back("NC_123456");
9841 
9842     ITERATE(vector<string>, id_it, ids) {
9843         EDiagSev sev = eDiag_Warning;
9844         scope.RemoveTopLevelSeqEntry(seh);
9845         if (NStr::StartsWith(*id_it, "NC_")) {
9846             entry->SetSeq().SetId().front()->SetOther().SetAccession(*id_it);
9847         } else {
9848             entry->SetSeq().SetId().front()->SetLocal().SetStr(*id_it);
9849             sev = eDiag_Critical;
9850         }
9851         seh = scope.AddTopLevelSeqEntry(*entry);
9852 
9853         submit->SetSub().SetCit().SetAuthors().ResetAffil();
9854         submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
9855         submit->SetSub().ResetContact();
9856         string msg_acc = NStr::StartsWith(*id_it, "NC") ? "ref|" + *id_it + "|" : "lcl|" + *id_it;
9857         expected_errors.push_back(new CExpectedError(msg_acc,
9858                                   sev, "MissingPubRequirement",
9859                                   "Submission citation affiliation has no country"));
9860         //AddChromosomeNoLocation(expected_errors, entry);
9861         eval = validator.Validate(*submit, &scope, options);
9862         CheckErrors (*eval, expected_errors);
9863 
9864         submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetCountry("USA");
9865         expected_errors[0]->SetErrMsg("Submission citation affiliation has no state");
9866         expected_errors[0]->SetSeverity(eDiag_Warning);
9867         eval = validator.Validate(*submit, &scope, options);
9868         CheckErrors (*eval, expected_errors);
9869         CLEAR_ERRORS
9870 
9871         submit->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetSub("VA");
9872         submit->SetSub().SetContact().SetContact().SetAffil().SetStd().SetAffil("some affiliation");
9873         expected_errors.push_back(new CExpectedError(msg_acc, sev, "MissingPubRequirement",
9874                                   "Submission citation affiliation has no country"));
9875         expected_errors[0]->SetAccession("");
9876         expected_errors[0]->SetSeverity(eDiag_Warning);
9877         //AddChromosomeNoLocation(expected_errors, entry);
9878         eval = validator.Validate(*submit, &scope, options);
9879         CheckErrors (*eval, expected_errors);
9880 
9881         submit->SetSub().SetContact().SetContact().SetAffil().SetStd().SetCountry("USA");
9882         expected_errors[0]->SetErrMsg("Submission citation affiliation has no state");
9883         expected_errors[0]->SetSeverity(eDiag_Warning);
9884         eval = validator.Validate(*submit, &scope, options);
9885         CheckErrors (*eval, expected_errors);
9886         CLEAR_ERRORS
9887 
9888         scope.RemoveTopLevelSeqEntry(seh);
9889         CRef<CPub> pub(new CPub());
9890         CRef<CSeqdesc> desc(new CSeqdesc());
9891         desc->SetPub().SetPub().Set().push_back(pub);
9892         entry->SetDescr().Set().push_back(desc);
9893         pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9894         pub->SetSub().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
9895 
9896         pub->SetSub().SetDate().SetStd().SetYear(2009);
9897         pub->SetSub().SetDate().SetStd().SetMonth(12);
9898         pub->SetSub().SetDate().SetStd().SetDay(31);
9899 
9900         seh = scope.AddTopLevelSeqEntry(*entry);
9901 
9902         expected_errors.push_back(new CExpectedError(msg_acc, sev, "MissingPubRequirement",
9903                                   "Submission citation affiliation has no country"));
9904         //AddChromosomeNoLocation(expected_errors, entry);
9905         eval = validator.Validate(seh, options);
9906         CheckErrors (*eval, expected_errors);
9907 
9908         pub->SetSub().SetAuthors().SetAffil().SetStd().SetCountry("USA");
9909         expected_errors[0]->SetErrMsg("Submission citation affiliation has no state");
9910         expected_errors[0]->SetSeverity(eDiag_Warning);
9911         eval = validator.Validate(seh, options);
9912         CheckErrors (*eval, expected_errors);
9913 
9914         pub->SetSub().SetAuthors().SetAffil().SetStd().SetSub("VA");
9915         pub->SetSub().SetAuthors().SetNames().SetStd().pop_back();
9916 
9917         expected_errors[0]->SetErrMsg("Submission citation has no author names");
9918         expected_errors[0]->SetSeverity(eDiag_Critical);
9919         eval = validator.Validate(seh, options);
9920         CheckErrors (*eval, expected_errors);
9921         CLEAR_ERRORS
9922 
9923         pub->SetSub().SetAuthors().SetNames().SetStd().push_back(author);
9924         pub->SetSub().SetAuthors().SetAffil().SetStd().ResetCountry();
9925         pub->SetSub().SetAuthors().SetAffil().SetStd().ResetSub();
9926         pub->SetSub().SetAuthors().SetAffil().SetStd().ResetAffil();
9927         expected_errors.push_back(new CExpectedError(msg_acc,
9928                 NStr::StartsWith(*id_it, "NC_") ? eDiag_Warning : eDiag_Critical,
9929                                   "MissingPubRequirement",
9930                                   "Submission citation has no affiliation"));
9931         //AddChromosomeNoLocation(expected_errors, entry);
9932         eval = validator.Validate(seh, options);
9933         CheckErrors (*eval, expected_errors);
9934 
9935         pub->SetSub().SetAuthors().ResetAffil();
9936         eval = validator.Validate(seh, options);
9937         CheckErrors (*eval, expected_errors);
9938 
9939         SetTech(entry, CMolInfo::eTech_htgs_0);
9940         expected_errors[0]->SetSeverity(eDiag_Warning);
9941         eval = validator.Validate(seh, options);
9942         CheckErrors (*eval, expected_errors);
9943         SetTech(entry, CMolInfo::eTech_htgs_1);
9944         eval = validator.Validate(seh, options);
9945         CheckErrors (*eval, expected_errors);
9946         SetTech(entry, CMolInfo::eTech_htgs_3);
9947         eval = validator.Validate(seh, options);
9948         CheckErrors (*eval, expected_errors);
9949         SetTech(entry, CMolInfo::eTech_unknown);
9950 
9951         CLEAR_ERRORS
9952 
9953         pub->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
9954         pub->SetGen().SetCit("Does not start with expected text");
9955 
9956         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Error, "MissingPubRequirement",
9957             "Unpublished citation text invalid"));
9958         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPubRequirement",
9959             "Publication date missing"));
9960         //AddChromosomeNoLocation(expected_errors, entry);
9961 
9962         eval = validator.Validate(seh, options);
9963         CheckErrors (*eval, expected_errors);
9964 
9965         delete expected_errors[1];
9966         expected_errors[1] = NULL;
9967 
9968         pub->SetGen().SetCit("submitted starts with expected text");
9969         pub->SetGen().SetDate().SetStr("?");
9970         expected_errors[0]->SetErrMsg("Publication date marked as '?'");
9971         expected_errors[0]->SetSeverity(eDiag_Warning);
9972         eval = validator.Validate(seh, options);
9973         CheckErrors (*eval, expected_errors);
9974 
9975         pub->SetGen().SetDate().SetStd().SetYear(0);
9976         expected_errors[0]->SetErrMsg("Publication date not set");
9977         eval = validator.Validate(seh, options);
9978         CheckErrors (*eval, expected_errors);
9979 
9980         pub->SetGen().ResetDate();
9981         pub->SetGen().SetAuthors().SetNames().SetStd().pop_back();
9982         if (!NStr::StartsWith(*id_it, "NC_")) {
9983             expected_errors[0]->SetSeverity(eDiag_Error);
9984         }
9985         expected_errors[0]->SetErrMsg("Publication has no author names");
9986         eval = validator.Validate(seh, options);
9987         CheckErrors (*eval, expected_errors);
9988 
9989         pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
9990         expected_errors[0]->SetSeverity(eDiag_Error);
9991         expected_errors[0]->SetErrMsg("Publication has no title");
9992         eval = validator.Validate(seh, options);
9993         CheckErrors (*eval, expected_errors);
9994 
9995         CRef<CCit_art::TTitle::C_E> art_title(new CCit_art::TTitle::C_E());
9996         art_title->SetName("article title");
9997         pub->SetArticle().SetTitle().Set().push_back(art_title);
9998         pub->SetArticle().SetAuthors().SetNames().SetStd().pop_back();
9999         expected_errors[0]->SetErrMsg("Publication has no author names");
10000         if (NStr::StartsWith(*id_it, "NC_")) {
10001             expected_errors[0]->SetSeverity(eDiag_Warning);
10002         }
10003         eval = validator.Validate(seh, options);
10004         CheckErrors (*eval, expected_errors);
10005 
10006         pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(author);
10007         pub->SetArticle().SetFrom().SetJournal().SetImp().SetVolume("vol 1");
10008         pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14-32");
10009         pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStd().SetYear(2009);
10010         expected_errors[0]->SetSeverity(eDiag_Error);
10011         expected_errors[0]->SetErrMsg("Journal title missing");
10012         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingISOJTA",
10013                                   "ISO journal title abbreviation missing"));
10014         eval = validator.Validate(seh, options);
10015         CheckErrors (*eval, expected_errors);
10016         CRef<CCit_jour::TTitle::C_E> journal_title(new CCit_jour::TTitle::C_E());
10017         journal_title->SetName("journal_title");
10018         pub->SetArticle().SetFrom().SetJournal().SetTitle().Set().push_back(journal_title);
10019         delete expected_errors[0];
10020         expected_errors[0] = NULL;
10021         eval = validator.Validate(seh, options);
10022         CheckErrors (*eval, expected_errors);
10023         CLEAR_ERRORS
10024 
10025         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingVolume",
10026                                   "Journal volume missing"));
10027         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPages",
10028                                   "Journal pages missing"));
10029         //AddChromosomeNoLocation(expected_errors, entry);
10030         CRef<CCit_jour::TTitle::C_E> iso_jta(new CCit_jour::TTitle::C_E());
10031         iso_jta->SetIso_jta("abbr");
10032         pub->SetArticle().SetFrom().SetJournal().SetTitle().Set().push_back(iso_jta);
10033         pub->SetArticle().SetFrom().SetJournal().SetImp().ResetVolume();
10034         pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPages();
10035         eval = validator.Validate(seh, options);
10036         CheckErrors (*eval, expected_errors);
10037         CLEAR_ERRORS
10038         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPages",
10039                                   "Journal pages missing"));
10040         //AddChromosomeNoLocation(expected_errors, entry);
10041         pub->SetArticle().SetFrom().SetJournal().SetImp().SetVolume("vol 1");
10042         eval = validator.Validate(seh, options);
10043         CheckErrors (*eval, expected_errors);
10044         CLEAR_ERRORS
10045         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingVolume",
10046                                   "Journal volume missing"));
10047         //AddChromosomeNoLocation(expected_errors, entry);
10048         pub->SetArticle().SetFrom().SetJournal().SetImp().ResetVolume();
10049         pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("14-32");
10050         expected_errors[0]->SetErrMsg("Journal volume missing");
10051         eval = validator.Validate(seh, options);
10052         CheckErrors (*eval, expected_errors);
10053         CLEAR_ERRORS
10054         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "MissingPubRequirement",
10055                                   "Publication date missing"));
10056         //AddChromosomeNoLocation(expected_errors, entry);
10057         pub->SetArticle().SetFrom().SetJournal().SetImp().SetVolume("vol 1");
10058         pub->SetArticle().SetFrom().SetJournal().SetImp().ResetDate();
10059         expected_errors[0]->SetErrMsg("Publication date missing");
10060         expected_errors[0]->SetSeverity(eDiag_Warning);
10061         eval = validator.Validate(seh, options);
10062         CheckErrors (*eval, expected_errors);
10063         pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStr("?");
10064         expected_errors[0]->SetErrMsg("Publication date marked as '?'");
10065         eval = validator.Validate(seh, options);
10066         CheckErrors (*eval, expected_errors);
10067         pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStd().SetYear(0);
10068         expected_errors[0]->SetErrMsg("Publication date not set");
10069         eval = validator.Validate(seh, options);
10070         CheckErrors (*eval, expected_errors);
10071 
10072         CLEAR_ERRORS
10073         //AddChromosomeNoLocation(expected_errors, entry);
10074         //suppress ISOJTA warning if electronic journal
10075         pub->SetArticle().SetFrom().SetJournal().SetImp().SetDate().SetStd().SetYear(2009);
10076         pub->SetArticle().SetFrom().SetJournal().SetTitle().Set().pop_back();
10077         journal_title->SetName("(er) Journal Title");
10078         eval = validator.Validate(seh, options);
10079         CheckErrors (*eval, expected_errors);
10080         journal_title->SetName("(journal title");
10081         pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_epublish);
10082         eval = validator.Validate(seh, options);
10083         CheckErrors (*eval, expected_errors);
10084         pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_aheadofprint);
10085         pub->SetArticle().SetFrom().SetJournal().SetImp().SetPrepub(CImprint::ePrepub_in_press);
10086         expected_errors.push_back(new CExpectedError(msg_acc, eDiag_Warning, "PublicationInconsistency",
10087                                   "In-press is not expected to have page numbers"));
10088         eval = validator.Validate(seh, options);
10089         CheckErrors (*eval, expected_errors);
10090         CLEAR_ERRORS
10091 
10092         entry->SetDescr().Set().pop_back();
10093     }
10094 }
10095 
10096 
// Expects a single UnnecessaryPubEquiv warning when a pub descriptor's
// pub list contains a CPub whose choice is set to Pub-equiv.
BOOST_AUTO_TEST_CASE(Test_Generic_UnnecessaryPubEquiv)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Build the offending descriptor and attach it to the entry before
    // STANDARD_SETUP runs.
    CRef<CPub> equiv_pub(new CPub());
    equiv_pub->SetEquiv();
    CRef<CSeqdesc> pub_desc(new CSeqdesc());
    pub_desc->SetPub().SetPub().Set().push_back(equiv_pub);
    entry->SetDescr().Set().push_back(pub_desc);

    STANDARD_SETUP

    CExpectedError* unexpected_equiv =
        new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryPubEquiv",
                           "Publication has unexpected internal Pub-equiv");
    expected_errors.push_back(unexpected_equiv);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
10117 
10118 
// Feeds a series of malformed journal page strings through the validator and
// expects exactly one BadPageNumbering warning, with a page-specific message,
// for each of them.
BOOST_AUTO_TEST_CASE(Test_Generic_BadPageNumbering)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CPub> pub = unit_test_util::BuildGoodArticlePub();
    CRef<CSeqdesc> pub_desc(new CSeqdesc());
    pub_desc->SetPub().SetPub().Set().push_back(pub);
    entry->SetDescr().Set().push_back(pub_desc);

    STANDARD_SETUP

    // Page string assigned to the imprint, paired with the message expected
    // for it. Rows are validated in this order.
    static const struct {
        const char* pages;
        const char* message;
    } kPageCases[] = {
        { "0-32",   "Page numbering has zero value" },
        { "14-0",   "Page numbering has zero value" },
        { "14--32", "Page numbering has negative value" },
        { "32-14",  "Page numbering out of order" },
        { "14-65",  "Page numbering greater than 50" },
        { "14-A",   "Page numbering stop looks strange" },
        { ".14-32", "Page numbering start looks strange" }
    };
    const size_t num_cases = sizeof(kPageCases) / sizeof(kPageCases[0]);

    // A single expected error is reused; only its message changes per row.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadPageNumbering",
                              kPageCases[0].message));
    //AddChromosomeNoLocation(expected_errors, entry);

    CImprint& imprint = pub->SetArticle().SetFrom().SetJournal().SetImp();
    for (size_t idx = 0; idx < num_cases; ++idx) {
        expected_errors[0]->SetErrMsg(kPageCases[idx].message);
        imprint.SetPages(kPageCases[idx].pages);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS
}
10167 
10168 
// Expects a single MedlineEntryPub error when a pub descriptor's pub list
// contains a CPub whose choice is set to a Medline entry.
BOOST_AUTO_TEST_CASE(Test_Generic_MedlineEntryPub)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Build the offending descriptor and attach it to the entry before
    // STANDARD_SETUP runs.
    CRef<CPub> medline_pub(new CPub());
    medline_pub->SetMedline();
    CRef<CSeqdesc> pub_desc(new CSeqdesc());
    pub_desc->SetPub().SetPub().Set().push_back(medline_pub);
    entry->SetDescr().Set().push_back(pub_desc);

    STANDARD_SETUP

    CExpectedError* medline_error =
        new CExpectedError("lcl|good", eDiag_Error, "MedlineEntryPub",
                           "Publication is medline entry");
    expected_errors.push_back(medline_error);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
10189 
10190 
MakeBadSeasonDate(CDate & date)10191 static void MakeBadSeasonDate(CDate& date)
10192 {
10193     date.SetStd().SetYear(2009);
10194     date.SetStd().SetMonth(12);
10195     date.SetStd().SetDay(31);
10196     date.SetStd().SetSeason("1");
10197 }
10198 
10199 
BOOST_AUTO_TEST_CASE(Test_Generic_BadDate)10200 BOOST_AUTO_TEST_CASE(Test_Generic_BadDate)
10201 {
10202     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
10203 
10204     // find sub pub and other pub
10205     CRef<CPub> subpub(NULL);
10206     CRef<CPub> otherpub(NULL);
10207     NON_CONST_ITERATE(CBioseq::TDescr::Tdata, it, entry->SetSeq().SetDescr().Set()) {
10208         if ((*it)->IsPub()) {
10209             if ((*it)->GetPub().GetPub().Get().front()->IsSub()) {
10210                 subpub = (*it)->SetPub().SetPub().Set().front();
10211             } else {
10212                 otherpub = (*it)->SetPub().SetPub().Set().front();
10213             }
10214         }
10215     }
10216 
10217     STANDARD_SETUP
10218 
10219     subpub->SetSub().SetDate().SetStr("?");
10220     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadDate",
10221                               "Submission citation date has error - BAD_STR"));
10222     //AddChromosomeNoLocation(expected_errors, entry);
10223     eval = validator.Validate(seh, options);
10224     CheckErrors (*eval, expected_errors);
10225 
10226     subpub->SetSub().SetDate().SetStd().SetYear(0);
10227     expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_YEAR");
10228     eval = validator.Validate(seh, options);
10229     CheckErrors (*eval, expected_errors);
10230 
10231     subpub->SetSub().SetDate().SetStd().SetYear(2009);
10232     subpub->SetSub().SetDate().SetStd().SetMonth(13);
10233     expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_MONTH");
10234     eval = validator.Validate(seh, options);
10235     CheckErrors (*eval, expected_errors);
10236 
10237     subpub->SetSub().SetDate().SetStd().SetYear(2009);
10238     subpub->SetSub().SetDate().SetStd().SetMonth(12);
10239     subpub->SetSub().SetDate().SetStd().SetDay(32);
10240     expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_DAY");
10241     eval = validator.Validate(seh, options);
10242     CheckErrors (*eval, expected_errors);
10243 
10244     MakeBadSeasonDate(subpub->SetSub().SetDate());
10245     expected_errors[0]->SetErrMsg("Submission citation date has error - BAD_SEASON");
10246     eval = validator.Validate(seh, options);
10247     CheckErrors (*eval, expected_errors);
10248     subpub->Assign(*(unit_test_util::BuildGoodCitSubPub()));
10249 
10250     CRef<CAuthor> author = unit_test_util::BuildGoodAuthor();
10251     CRef<CPub> gen(new CPub());
10252     gen->SetGen().SetAuthors().SetNames().SetStd().push_back(author);
10253     gen->SetGen().SetTitle("gen title");
10254     MakeBadSeasonDate(gen->SetGen().SetDate());
10255     otherpub->Assign(*gen);
10256     expected_errors[0]->SetErrMsg("Publication date has error - BAD_SEASON");
10257     eval = validator.Validate(seh, options);
10258     CheckErrors (*eval, expected_errors);
10259 
10260     otherpub->Assign(*(unit_test_util::BuildGoodArticlePub()));
10261     MakeBadSeasonDate(otherpub->SetArticle().SetFrom().SetJournal().SetImp().SetDate());
10262     eval = validator.Validate(seh, options);
10263     CheckErrors (*eval, expected_errors);
10264     otherpub->Assign(*(unit_test_util::BuildGoodArticlePub()));
10265 
10266     CRef<CSeqdesc> desc(new CSeqdesc());
10267     entry->SetDescr().Set().push_back(desc);
10268     MakeBadSeasonDate(desc->SetCreate_date());
10269     expected_errors[0]->SetErrMsg("Create date has error - BAD_SEASON");
10270     eval = validator.Validate(seh, options);
10271     CheckErrors (*eval, expected_errors);
10272 
10273     MakeBadSeasonDate(desc->SetUpdate_date());
10274     expected_errors[0]->SetErrMsg("Update date has error - BAD_SEASON");
10275     eval = validator.Validate(seh, options);
10276     CheckErrors (*eval, expected_errors);
10277 
10278     CLEAR_ERRORS
10279 }
10280 
10281 
// Expects a StructuredCitGenCit error when the cit text of a Cit-gen pub
// contains "Title=" and, in a second pass, "Journal=".
BOOST_AUTO_TEST_CASE(Test_Generic_StructuredCitGenCit)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Cit-gen with author, title and year, whose cit string carries an
    // embedded "Title=" field.
    CRef<CPub> pub(new CPub());
    CPub::TGen& cit_gen = pub->SetGen();
    cit_gen.SetAuthors().SetNames().SetStd().push_back(unit_test_util::BuildGoodAuthor());
    cit_gen.SetTitle("gen title");
    cit_gen.SetDate().SetStd().SetYear(2009);
    cit_gen.SetCit("submitted something Title=foo");

    CRef<CSeqdesc> pub_desc(new CSeqdesc());
    pub_desc->SetPub().SetPub().Set().push_back(pub);
    entry->SetDescr().Set().push_back(pub_desc);

    STANDARD_SETUP

    CExpectedError* embedded_field =
        new CExpectedError("lcl|good", eDiag_Error, "StructuredCitGenCit",
                           "Unpublished citation has embedded Title");
    expected_errors.push_back(embedded_field);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Same check with an embedded "Journal=" field instead.
    cit_gen.SetCit("submitted something Journal=bar");
    expected_errors[0]->SetErrMsg("Unpublished citation has embedded Journal");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
10310 
10311 
// Expects a CollidingSerialNumbers warning when a pub descriptor and a pub
// feature each hold a Cit-gen built with the serial number 1234.
BOOST_AUTO_TEST_CASE(Test_Generic_CollidingSerialNumbers)
{
    const int kSerialNumber = 1234;

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CAuthor> no_author;

    // First publication: descriptor on the entry.
    CRef<CPub> desc_pub = unit_test_util::BuildGoodCitGenPub(no_author, kSerialNumber);
    CRef<CSeqdesc> pub_desc(new CSeqdesc());
    pub_desc->SetPub().SetPub().Set().push_back(desc_pub);
    entry->SetDescr().Set().push_back(pub_desc);

    // Second publication: pub feature on lcl|good, interval 0..15.
    CRef<CSeq_feat> pub_feat(new CSeq_feat());
    CSeq_loc& feat_loc = pub_feat->SetLocation();
    feat_loc.SetInt().SetId().SetLocal().SetStr("good");
    feat_loc.SetInt().SetFrom(0);
    feat_loc.SetInt().SetTo(15);
    CRef<CPub> feat_pub = unit_test_util::BuildGoodCitGenPub(no_author, kSerialNumber);
    pub_feat->SetData().SetPub().SetPub().Set().push_back(feat_pub);
    unit_test_util::AddFeat(pub_feat, entry);

    STANDARD_SETUP

    CExpectedError* collision =
        new CExpectedError("lcl|good", eDiag_Warning, "CollidingSerialNumbers",
                           "Multiple publications have serial number 1234");
    expected_errors.push_back(collision);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
10338 
10339 
BOOST_AUTO_TEST_CASE(Test_Generic_EmbeddedScript)10340 BOOST_AUTO_TEST_CASE(Test_Generic_EmbeddedScript)
10341 {
10342     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
10343     CRef<CAuthor> author = unit_test_util::BuildGoodAuthor();
10344     author->SetName().SetName().SetLast("foo<script");
10345 
10346     CRef<CPub> pub = unit_test_util::BuildGoodCitGenPub(author, -1);
10347     CRef<CSeqdesc> desc(new CSeqdesc());
10348     desc->SetPub().SetPub().Set().push_back(pub);
10349     entry->SetDescr().Set().push_back(desc);
10350 
10351     CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
10352 
10353     STANDARD_SETUP
10354 
10355     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadCharInAuthorLastName",
10356                               "Bad characters in author foo<script"));
10357     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "EmbeddedScript",
10358                               "Script tag found in item"));
10359     //AddChromosomeNoLocation(expected_errors, entry);
10360     eval = validator.Validate(seh, options);
10361     CheckErrors (*eval, expected_errors);
10362 
10363     author->SetName().SetName().SetLast("Last");
10364     delete expected_errors[0];
10365     expected_errors[0] = NULL;
10366 
10367     feat->SetComment("<object");
10368     eval = validator.Validate(seh, options);
10369     CheckErrors (*eval, expected_errors);
10370     feat->ResetComment();
10371     feat->SetComment("misc_feature needs a comment");
10372 
10373     feat->SetTitle("<applet");
10374     eval = validator.Validate(seh, options);
10375     CheckErrors (*eval, expected_errors);
10376     feat->ResetTitle();
10377 
10378     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_line, "<embed");
10379     eval = validator.Validate(seh, options);
10380     CheckErrors (*eval, expected_errors);
10381     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_line, "");
10382 
10383     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_acronym, "<form");
10384     eval = validator.Validate(seh, options);
10385     CheckErrors (*eval, expected_errors);
10386     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_acronym, "");
10387 
10388     pub->SetGen().SetTitle("javascript:");
10389     eval = validator.Validate(seh, options);
10390     CheckErrors (*eval, expected_errors);
10391     pub->SetGen().SetTitle("good title");
10392 
10393     unit_test_util::SetLineage(entry, "vbscript:");
10394     eval = validator.Validate(seh, options);
10395     CheckErrors (*eval, expected_errors);
10396     unit_test_util::SetLineage(entry, "");
10397 
10398     CLEAR_ERRORS
10399 }
10400 
10401 
BOOST_AUTO_TEST_CASE(Test_Generic_PublicationInconsistency)10402 BOOST_AUTO_TEST_CASE(Test_Generic_PublicationInconsistency)
10403 {
10404     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
10405     CRef<CSeqdesc> desc(new CSeqdesc());
10406     CRef<CPub> pub = unit_test_util::BuildGoodArticlePub();
10407     pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_aheadofprint);
10408     desc->SetPub().SetPub().Set().push_back(pub);
10409     entry->SetSeq().SetDescr().Set().push_back(desc);
10410 
10411     STANDARD_SETUP
10412     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10413                                                  "Ahead-of-print without in-press"));
10414     //AddChromosomeNoLocation(expected_errors, entry);
10415     eval = validator.Validate(seh, options);
10416     CheckErrors (*eval, expected_errors);
10417 
10418     CLEAR_ERRORS
10419     pub->SetArticle().SetFrom().SetJournal().SetImp().SetPubstatus(ePubStatus_epublish);
10420     pub->SetArticle().SetFrom().SetJournal().SetImp().SetPrepub(CImprint::ePrepub_in_press);
10421     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10422                                                  "In-press is not expected to have page numbers"));
10423     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10424                                                  "Electronic-only publication should not also be in-press"));
10425     //AddChromosomeNoLocation(expected_errors, entry);
10426     eval = validator.Validate(seh, options);
10427     CheckErrors (*eval, expected_errors);
10428     pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPubstatus();
10429     pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPrepub();
10430 
10431     CLEAR_ERRORS
10432     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10433                                                  "Empty consortium"));
10434     //AddChromosomeNoLocation(expected_errors, entry);
10435     CRef<CAuthor> consortium(new CAuthor());
10436     consortium->SetName().SetConsortium("");
10437     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(consortium);
10438     eval = validator.Validate(seh, options);
10439     CheckErrors (*eval, expected_errors);
10440 
10441     consortium->SetName().SetConsortium("duplicate");
10442     CRef<CAuthor> consortium2(new CAuthor());
10443     consortium2->SetName().SetConsortium("duplicate");
10444     pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(consortium2);
10445     expected_errors[0]->SetErrMsg("Duplicate consortium 'duplicate'");
10446     eval = validator.Validate(seh, options);
10447     CheckErrors (*eval, expected_errors);
10448 
10449     CLEAR_ERRORS
10450     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10451                                                  "In-press is not expected to have page numbers"));
10452     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
10453                                                  "Duplicate consortium 'duplicate'"));
10454     //AddChromosomeNoLocation(expected_errors, entry);
10455     pub->SetArticle().SetFrom().SetJournal().SetImp().SetPrepub(CImprint::ePrepub_in_press);
10456     pub->SetArticle().SetFrom().SetJournal().SetImp().SetPages("75-84");
10457     eval = validator.Validate(seh, options);
10458     CheckErrors (*eval, expected_errors);
10459     pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPrepub();
10460     pub->SetArticle().SetFrom().SetJournal().SetImp().ResetPages();
10461 
10462     CLEAR_ERRORS
10463 }
10464 
10465 
AddSgmlError(vector<CExpectedError * > & expected_errors,const string & valtype,const string & val)10466 void AddSgmlError
10467 (vector< CExpectedError *>& expected_errors,
10468  const string& valtype,
10469  const string& val)
10470 {
10471     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText", valtype + " " + val + " has SGML"));
10472 }
10473 
BOOST_AUTO_TEST_CASE(Test_Generic_SgmlPresentInText)10474 BOOST_AUTO_TEST_CASE(Test_Generic_SgmlPresentInText)
10475 {
10476     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
10477 
10478     STANDARD_SETUP
10479 
10480     vector<string> sgml_tags;
10481 
10482     sgml_tags.push_back("&gt;");
10483     sgml_tags.push_back("&lt;");
10484     sgml_tags.push_back("&amp;");
10485     sgml_tags.push_back("&agr;");
10486     sgml_tags.push_back("&Agr;");
10487     sgml_tags.push_back("&bgr;");
10488     sgml_tags.push_back("&Bgr;");
10489     sgml_tags.push_back("&ggr;");
10490     sgml_tags.push_back("&Ggr;");
10491     sgml_tags.push_back("&dgr;");
10492     sgml_tags.push_back("&Dgr;");
10493     sgml_tags.push_back("&egr;");
10494     sgml_tags.push_back("&Egr;");
10495     sgml_tags.push_back("&zgr;");
10496     sgml_tags.push_back("&Zgr;");
10497     sgml_tags.push_back("&eegr;");
10498     sgml_tags.push_back("&EEgr;");
10499     sgml_tags.push_back("&thgr;");
10500     sgml_tags.push_back("&THgr;");
10501     sgml_tags.push_back("&igr;");
10502     sgml_tags.push_back("&Igr;");
10503     sgml_tags.push_back("&kgr;");
10504     sgml_tags.push_back("&Kgr;");
10505     sgml_tags.push_back("&lgr;");
10506     sgml_tags.push_back("&Lgr;");
10507     sgml_tags.push_back("&mgr;");
10508     sgml_tags.push_back("&Mgr;");
10509     sgml_tags.push_back("&ngr;");
10510     sgml_tags.push_back("&Ngr;");
10511     sgml_tags.push_back("&xgr;");
10512     sgml_tags.push_back("&Xgr;");
10513     sgml_tags.push_back("&ogr;");
10514     sgml_tags.push_back("&Ogr;");
10515     sgml_tags.push_back("&pgr;");
10516     sgml_tags.push_back("&Pgr;");
10517     sgml_tags.push_back("&rgr;");
10518     sgml_tags.push_back("&Rgr;");
10519     sgml_tags.push_back("&sgr;");
10520     sgml_tags.push_back("&Sgr;");
10521     sgml_tags.push_back("&sfgr;");
10522     sgml_tags.push_back("&tgr;");
10523     sgml_tags.push_back("&Tgr;");
10524     sgml_tags.push_back("&ugr;");
10525     sgml_tags.push_back("&Ugr;");
10526     sgml_tags.push_back("&phgr;");
10527     sgml_tags.push_back("&PHgr;");
10528     sgml_tags.push_back("&khgr;");
10529     sgml_tags.push_back("&KHgr;");
10530     sgml_tags.push_back("&psgr;");
10531     sgml_tags.push_back("&PSgr;");
10532     sgml_tags.push_back("&ohgr;");
10533     sgml_tags.push_back("&OHgr;");
10534 
10535     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10536                               "taxname %s has SGML"));
10537     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
10538                               "Organism not found in taxonomy database"));
10539     //AddChromosomeNoLocation(expected_errors, entry);
10540     ITERATE(vector<string>, it, sgml_tags) {
10541         string taxname = "a" + *it + "b";
10542         unit_test_util::SetTaxname(entry, taxname);
10543         expected_errors[0]->SetErrMsg("taxname " + taxname + " has SGML");
10544         eval = validator.Validate(seh, options);
10545         CheckErrors (*eval, expected_errors);
10546     }
10547 
10548     unit_test_util::SetSebaea_microphylla(entry);
10549     delete expected_errors[1];
10550     expected_errors[1] = NULL;
10551 
10552     size_t tag_num = 0;
10553 
10554     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_line, sgml_tags[tag_num]);
10555     expected_errors[0]->SetErrMsg("subsource " + sgml_tags[tag_num] + " has SGML");
10556     eval = validator.Validate(seh, options);
10557     CheckErrors (*eval, expected_errors);
10558     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_line, "");
10559 
10560     ++tag_num;
10561     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_acronym, sgml_tags[tag_num]);
10562     expected_errors[0]->SetErrMsg("orgmod " + sgml_tags[tag_num] + " has SGML");
10563     eval = validator.Validate(seh, options);
10564     CheckErrors (*eval, expected_errors);
10565     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_acronym, "");
10566 
10567     CLEAR_ERRORS
10568     tag_num++;
10569     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10570                               "dbxref database " + sgml_tags[tag_num] + " has SGML"));
10571     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IllegalDbXref",
10572                               "Illegal db_xref type " + sgml_tags[tag_num] + " (1234)"));
10573     //AddChromosomeNoLocation(expected_errors, entry);
10574 
10575     unit_test_util::SetDbxref (entry, sgml_tags[tag_num], 1234);
10576     eval = validator.Validate(seh, options);
10577     CheckErrors (*eval, expected_errors);
10578     unit_test_util::RemoveDbxref (entry, sgml_tags[tag_num], 1234);
10579 
10580     CLEAR_ERRORS
10581 
10582     tag_num++;
10583     AddSgmlError(expected_errors, "dbxref value", sgml_tags[tag_num]);
10584     //AddChromosomeNoLocation(expected_errors, entry);
10585     unit_test_util::SetDbxref (entry, "AFTOL", sgml_tags[tag_num]);
10586     eval = validator.Validate(seh, options);
10587     CheckErrors (*eval, expected_errors);
10588     unit_test_util::RemoveDbxref (entry, "AFTOL", 0);
10589 
10590     CLEAR_ERRORS
10591     ++tag_num;
10592     scope.RemoveTopLevelSeqEntry(seh);
10593     CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature (entry);
10594     seh = scope.AddTopLevelSeqEntry(*entry);
10595     AddSgmlError(expected_errors, "dbxref database", sgml_tags[tag_num]);
10596     //AddChromosomeNoLocation(expected_errors, entry);
10597     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IllegalDbXref",
10598                               "Illegal db_xref type " + sgml_tags[tag_num] + " (1234)"));
10599     unit_test_util::SetDbxref(feat, sgml_tags[tag_num], 1234);
10600     eval = validator.Validate(seh, options);
10601     CheckErrors (*eval, expected_errors);
10602     unit_test_util::RemoveDbxref (feat, sgml_tags[tag_num], 1234);
10603 
10604     CLEAR_ERRORS
10605 
10606     tag_num++;
10607     unit_test_util::SetDbxref (feat, "AFTOL", sgml_tags[tag_num]);
10608     AddSgmlError(expected_errors, "dbxref value", sgml_tags[tag_num]);
10609     //AddChromosomeNoLocation(expected_errors, entry);
10610 
10611     eval = validator.Validate(seh, options);
10612     CheckErrors (*eval, expected_errors);
10613     unit_test_util::RemoveDbxref (feat, "AFTOL", 0);
10614 
10615     CLEAR_ERRORS
10616 
10617     tag_num++;
10618     scope.RemoveTopLevelSeqEntry(seh);
10619     string foo = sgml_tags[tag_num] + "foo";
10620     feat->SetData().SetGene().SetLocus(foo);
10621     seh = scope.AddTopLevelSeqEntry(*entry);
10622     AddSgmlError(expected_errors, "gene locus", foo);
10623     //AddChromosomeNoLocation(expected_errors, entry);
10624     eval = validator.Validate(seh, options);
10625     CheckErrors (*eval, expected_errors);
10626     feat->SetData().SetGene().SetLocus("good locus");
10627 
10628     CLEAR_ERRORS
10629     tag_num++;
10630     feat->SetData().SetGene().SetLocus_tag(sgml_tags[tag_num]);
10631     AddSgmlError(expected_errors, "gene locus_tag", sgml_tags[tag_num]);
10632     //AddChromosomeNoLocation(expected_errors, entry);
10633     eval = validator.Validate(seh, options);
10634     CheckErrors (*eval, expected_errors);
10635     feat->SetData().SetGene().ResetLocus_tag();
10636 
10637     CLEAR_ERRORS
10638     tag_num++;
10639     feat->SetData().SetGene().SetDesc(sgml_tags[tag_num]);
10640     AddSgmlError(expected_errors, "gene description", sgml_tags[tag_num]);
10641     //AddChromosomeNoLocation(expected_errors, entry);
10642     eval = validator.Validate(seh, options);
10643     CheckErrors (*eval, expected_errors);
10644     feat->SetData().SetGene().ResetDesc();
10645 
10646     CLEAR_ERRORS
10647     tag_num++;
10648     feat->SetData().SetGene().SetSyn().push_back(sgml_tags[tag_num]);
10649     AddSgmlError(expected_errors, "gene synonym", sgml_tags[tag_num]);
10650     //AddChromosomeNoLocation(expected_errors, entry);
10651     eval = validator.Validate(seh, options);
10652     CheckErrors (*eval, expected_errors);
10653     feat->SetData().SetGene().ResetDesc();
10654 
10655     CLEAR_ERRORS
10656 
10657     tag_num++;
10658     scope.RemoveTopLevelSeqEntry(seh);
10659     feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
10660     foo = sgml_tags[tag_num] + "foo";
10661     feat->SetData().SetRna().SetExt().SetName(foo);
10662     seh = scope.AddTopLevelSeqEntry(*entry);
10663     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
10664                               "No CDS location match for 1 mRNA"));
10665 
10666     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10667                               "mRNA name " + foo + " has SGML"));
10668     //AddChromosomeNoLocation(expected_errors, entry);
10669     eval = validator.Validate(seh, options);
10670     CheckErrors (*eval, expected_errors);
10671 
10672     CLEAR_ERRORS;
10673 
10674     tag_num++;
10675     scope.RemoveTopLevelSeqEntry(seh);
10676     feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
10677     foo = sgml_tags[tag_num] + "foo";
10678     feat->SetData().SetRna().SetExt().SetName(foo);
10679     seh = scope.AddTopLevelSeqEntry(*entry);
10680     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "SgmlPresentInText",
10681                               "rRNA name " + foo + " has SGML"));
10682     //AddChromosomeNoLocation(expected_errors, entry);
10683     eval = validator.Validate(seh, options);
10684     CheckErrors (*eval, expected_errors);
10685     feat->SetData().SetRna().SetExt().SetName("good name");
10686 
10687     tag_num++;
10688     feat->SetComment(sgml_tags[tag_num]);
10689     expected_errors[0]->SetErrMsg("feature comment " + sgml_tags[tag_num] + " has SGML");
10690     eval = validator.Validate(seh, options);
10691     CheckErrors (*eval, expected_errors);
10692     feat->ResetComment();
10693 
10694     tag_num++;
10695     CRef<CGb_qual> qual(new CGb_qual());
10696     qual->SetQual("standard_name");
10697     qual->SetVal(sgml_tags[tag_num]);
10698     feat->SetQual().push_back(qual);
10699     expected_errors[0]->SetErrMsg("feature qualifier " + sgml_tags[tag_num] + " has SGML");
10700     eval = validator.Validate(seh, options);
10701     CheckErrors (*eval, expected_errors);
10702     feat->SetQual().pop_back();
10703 
10704     tag_num++;
10705     scope.RemoveTopLevelSeqEntry(seh);
10706     entry = unit_test_util::BuildGoodNucProtSet ();
10707     feat = entry->SetSet().SetSeq_set().back()->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
10708     foo = sgml_tags[tag_num] + "foo";
10709     feat->SetData().SetProt().SetName().front().assign(foo);
10710     seh = scope.AddTopLevelSeqEntry(*entry);
10711     expected_errors[0]->SetAccession("lcl|prot");
10712     expected_errors[0]->SetErrMsg("protein name " + foo + " has SGML");
10713     eval = validator.Validate(seh, options);
10714     CheckErrors (*eval, expected_errors);
10715     feat->SetData().SetProt().SetName().pop_back();
10716     feat->SetData().SetProt().SetName().push_back("bar");
10717 
10718 
10719     tag_num++;
10720     feat->SetData().SetProt().SetDesc(sgml_tags[tag_num]);
10721     expected_errors[0]->SetErrMsg("protein description " + sgml_tags[tag_num] + " has SGML");
10722     eval = validator.Validate(seh, options);
10723     CheckErrors (*eval, expected_errors);
10724     feat->SetData().SetProt().ResetDesc();
10725     CLEAR_ERRORS
10726 }
10727 
10728 
// Checks that a pub descriptor whose comment reads "Publication Status"
// (space, hyphen or underscore separated) produces the
// UnexpectedPubStatusComment warning for the pub-status values set below.
BOOST_AUTO_TEST_CASE(Test_Generic_UnexpectedPubStatusComment)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Article publication; the imprint is modified repeatedly during the test.
    CRef<CPub> pub = unit_test_util::BuildGoodArticlePub();
    CImprint& imprint = pub->SetArticle().SetFrom().SetJournal().SetImp();
    imprint.SetPubstatus(ePubStatus_epublish);

    // Pub descriptor that carries the comment under test.
    CRef<CSeqdesc> desc(new CSeqdesc());
    CPubdesc& pubdesc = desc->SetPub();
    pubdesc.SetPub().Set().push_back(pub);
    pubdesc.SetComment("Publication Status");
    entry->SetSeq().SetDescr().Set().push_back(desc);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnexpectedPubStatusComment",
                                                 "Publication status is in comment for pmid 0"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Same single warning is expected for the ppublish status.
    imprint.SetPubstatus(ePubStatus_ppublish);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS

    // Ahead-of-print + in-press additionally triggers PublicationInconsistency.
    imprint.SetPubstatus(ePubStatus_aheadofprint);
    imprint.SetPrepub(CImprint::ePrepub_in_press);
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PublicationInconsistency",
                              "In-press is not expected to have page numbers"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnexpectedPubStatusComment",
                                                 "Publication status is in comment for pmid 0"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Alternative separators in the comment must yield the same two warnings.
    static const char* const kCommentVariants[] = {
        "Publication-Status",
        "Publication_Status"
    };
    for (size_t idx = 0; idx < sizeof(kCommentVariants) / sizeof(kCommentVariants[0]); ++idx) {
        pubdesc.SetComment(kCommentVariants[idx]);
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
    }

    CLEAR_ERRORS
}
10772 
10773 
// A protein added to a nuc-prot set without any coding region pointing at it
// must be reported with the NoCdRegionPtr error.
BOOST_AUTO_TEST_CASE(Test_PKG_NoCdRegionPtr)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Extra protein, stripped of its own source and pub descriptors.
    CRef<CSeq_entry> extra_prot = unit_test_util::BuildGoodProtSeq();
    EDIT_EACH_DESCRIPTOR_ON_BIOSEQ (desc_it, extra_prot->SetSeq()) {
        if ((*desc_it)->IsPub() || (*desc_it)->IsSource()) {
            ERASE_DESCRIPTOR_ON_BIOSEQ(desc_it, extra_prot->SetSeq());
        }
    }
    entry->SetSet().SetSeq_set().push_back(extra_prot);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoCdRegionPtr",
                                                 "No CdRegion in nuc-prot set points to this protein"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
10798 
10799 
// Exercises the NucProtProblem variants: a nuc-prot set with no nucleotide,
// with no protein, and with more than one unsegmented nucleotide.
BOOST_AUTO_TEST_CASE(Test_PKG_NucProtProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CBioseq_set::TSeq_set& members = entry->SetSet().SetSeq_set();

    // Phase 1: take out the nucleotide and the first set-level feature,
    // keeping both around so they can be restored later.
    CRef<CSeq_entry> nuc_entry = members.front();
    members.pop_front();
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    entry->SetSet().SetAnnot().front()->SetData().SetFtable().pop_front();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "NoCdRegionPtr",
                                                 "No CdRegion in nuc-prot set points to this protein"));
    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "NucProtProblem",
                                                 "No nucleotides in nuc-prot set"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Phase 2: swap the protein out and the nucleotide (plus CDS) back in.
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_entry> prot_entry = members.front();
    members.pop_front();
    members.push_back(nuc_entry);
    entry->SetSet().SetAnnot().front()->SetData().SetFtable().push_back(cds_feat);
    seh = scope.AddTopLevelSeqEntry(*entry);

    // Slot 0 is freed and left empty; slot 1 is re-targeted at the nucleotide.
    delete expected_errors[0];
    expected_errors[0] = NULL;
    CExpectedError* nuc_prot_problem = expected_errors[1];
    nuc_prot_problem->SetErrMsg("No proteins in nuc-prot set");
    nuc_prot_problem->SetAccession("lcl|nuc");
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MissingCDSproduct",
                                                 "Unable to find product Bioseq from CDS feature"));
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Phase 3: add a second nucleotide (without source/pub descriptors)
    // and put the protein back.
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_entry> second_nuc = unit_test_util::BuildGoodSeq();
    EDIT_EACH_DESCRIPTOR_ON_BIOSEQ (desc_it, second_nuc->SetSeq()) {
        if ((*desc_it)->IsPub() || (*desc_it)->IsSource()) {
            ERASE_DESCRIPTOR_ON_BIOSEQ(desc_it, second_nuc->SetSeq());
        }
    }
    members.push_back(second_nuc);
    members.push_back(prot_entry);
    seh = scope.AddTopLevelSeqEntry(*entry);

    nuc_prot_problem->SetSeverity(eDiag_Critical);
    nuc_prot_problem->SetErrMsg("Multiple unsegmented nucleotides in nuc-prot set");
    // Drop the MissingCDSproduct expectation (the last element).
    delete expected_errors.back();
    expected_errors.pop_back();
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
10852 
10853 
// A segset that contains only ordinary (non-segmented) Bioseqs must be
// reported with SegSetProblem.
BOOST_AUTO_TEST_CASE(Test_PKG_SegSetProblem)
{
    CRef<CSeq_entry> entry(new CSeq_entry());
    CBioseq_set& seg_set = entry->SetSet();
    seg_set.SetClass(CBioseq_set::eClass_segset);

    // Two plain sequences; the second one gets a distinct local id.
    seg_set.SetSeq_set().push_back(unit_test_util::BuildGoodSeq());
    CRef<CSeq_entry> second_seq = unit_test_util::BuildGoodSeq();
    seg_set.SetSeq_set().push_back(second_seq);
    second_seq->SetSeq().SetId().front()->SetLocal().SetStr("good2");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "SegSetProblem",
                                                 "No segmented Bioseq in segset"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
10872 
10873 
// An empty nested Bioseq-set must produce the EmptySet warning for each of
// the set classes tried below.
BOOST_AUTO_TEST_CASE(Test_PKG_EmptySet)
{
    CRef<CSeq_entry> entry(new CSeq_entry());
    entry->SetSet().SetClass(CBioseq_set::eClass_genbank);
    entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodSeq());

    // The empty child set; its class is varied after the first validation.
    CRef<CSeq_entry> centry(new CSeq_entry());
    centry->SetSet().SetClass(CBioseq_set::eClass_gi);
    entry->SetSet().SetSeq_set().push_back(centry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("", eDiag_Warning, "EmptySet",
                                                 "No Bioseqs in this set"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Remaining classes, validated in this exact order.
    static const CBioseq_set::EClass kRemainingClasses[] = {
        CBioseq_set::eClass_gibb,
        CBioseq_set::eClass_pir,
        CBioseq_set::eClass_pub_set,
        CBioseq_set::eClass_equiv,
        CBioseq_set::eClass_swissprot,
        CBioseq_set::eClass_pdb_entry
    };
    const size_t kNumRemaining = sizeof(kRemainingClasses) / sizeof(kRemainingClasses[0]);
    for (size_t idx = 0; idx < kNumRemaining; ++idx) {
        centry->SetSet().SetClass(kRemainingClasses[idx]);
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
    }

    CLEAR_ERRORS
}
10917 
10918 
// A nuc-prot set that contains an (empty) eco-set must be reported with both
// EmptySet and NucProtNotSegSet.
BOOST_AUTO_TEST_CASE(Test_PKG_NucProtNotSegSet)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Nest an empty eco-set inside the nuc-prot set.
    CRef<CSeq_entry> nested_eco(new CSeq_entry());
    nested_eco->SetSet().SetClass(CBioseq_set::eClass_eco_set);
    entry->SetSet().SetSeq_set().push_back(nested_eco);

    STANDARD_SETUP

    CExpectedError* empty_set_warning =
        new CExpectedError ("", eDiag_Warning, "EmptySet",
                            "Pop/Phy/Mut/Eco set has no components");
    CExpectedError* wrong_set_error =
        new CExpectedError("", eDiag_Critical, "NucProtNotSegSet",
                           "Nuc-prot Bioseq-set contains wrong Bioseq-set, its class is \"eco-set\".");
    expected_errors.push_back(empty_set_warning);
    expected_errors.push_back(wrong_set_error);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
10937 
10938 
BOOST_AUTO_TEST_CASE(Test_PKG_GenomicProductPackagingProblem)10939 BOOST_AUTO_TEST_CASE(Test_PKG_GenomicProductPackagingProblem)
10940 {
10941     CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
10942     CRef<CSeq_entry> contig = entry->SetSet().SetSeq_set().front();
10943 
10944     CRef<CSeq_entry> stray = unit_test_util::BuildGoodNucProtSet();
10945     CRef<CSeq_entry> nuc = stray->SetSet().SetSeq_set().front();
10946     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAA");
10947     nuc->SetSeq().SetInst().SetLength(27);
10948     nuc->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
10949     unit_test_util::SetBiomol(nuc, CMolInfo::eBiomol_mRNA);
10950 
10951     unit_test_util::ChangeId(stray, "2");
10952     entry->SetSet().SetSeq_set().push_back(stray);
10953     CRef<CSeq_feat> cds(new CSeq_feat());
10954     cds->SetData().SetCdregion();
10955     cds->SetLocation().SetInt().SetFrom(30);
10956     cds->SetLocation().SetInt().SetTo(56);
10957     cds->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
10958     cds->SetProduct().SetWhole().SetLocal().SetStr("prot2");
10959     unit_test_util::AddFeat(cds, contig);
10960 
10961     STANDARD_SETUP
10962 
10963     expected_errors.push_back(new CExpectedError("lcl|nuc2", eDiag_Warning, "GenomicProductPackagingProblem",
10964                                                  "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
10965 
10966     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSwithNoMRNA",
10967                                                  "Unmatched CDS"));
10968     //AddChromosomeNoLocation(expected_errors, entry);
10969 
10970     eval = validator.Validate(seh, options);
10971     CheckErrors (*eval, expected_errors);
10972     CLEAR_ERRORS
10973 
10974     scope.RemoveTopLevelSeqEntry(seh);
10975     // take CDS away and add mrna - that way protein is orphan, nucleotide is product
10976     contig->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
10977 
10978     CRef<CSeq_feat> mrna (new CSeq_feat());
10979     mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
10980     mrna->SetData().SetRna().SetExt().SetName("fake protein name");
10981     mrna->SetLocation().SetInt().SetFrom(30);
10982     mrna->SetLocation().SetInt().SetTo(56);
10983     mrna->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
10984     mrna->SetProduct().SetWhole().SetLocal().SetStr("nuc2");
10985     unit_test_util::AddFeat(mrna, contig);
10986     seh = scope.AddTopLevelSeqEntry(*entry);
10987     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
10988                                                  "No CDS location match for 1 mRNA"));
10989     expected_errors.push_back(new CExpectedError("lcl|prot2", eDiag_Warning, "GenomicProductPackagingProblem",
10990                                                  "Protein bioseq should be product of CDS feature on contig, but is not"));
10991     //AddChromosomeNoLocation(expected_errors, entry);
10992 
10993     eval = validator.Validate(seh, options);
10994     CheckErrors (*eval, expected_errors);
10995 
10996     CLEAR_ERRORS
10997 
10998     // put CDS back, move annotation to gen-prod-set
10999     scope.RemoveTopLevelSeqEntry(seh);
11000     contig->SetSeq().SetAnnot().front()->SetData().SetFtable().push_back(cds);
11001     CRef<CSeq_feat> gene(new CSeq_feat());
11002     gene->SetLocation().SetInt().SetFrom(30);
11003     gene->SetLocation().SetInt().SetTo(56);
11004     gene->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
11005     gene->SetData().SetGene().SetLocus("gene locus");
11006     unit_test_util::AddFeat(gene, entry);
11007     seh = scope.AddTopLevelSeqEntry(*entry);
11008 
11009     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "GenomicProductPackagingProblem",
11010                                   "Seq-annot packaged directly on genomic product set"));
11011     //AddChromosomeNoLocation(expected_errors, entry);
11012     eval = validator.Validate(seh, options);
11013     CheckErrors (*eval, expected_errors);
11014 
11015 
11016     scope.RemoveTopLevelSeqEntry(seh);
11017     entry->SetSet().ResetAnnot();
11018     CRef<CSeq_feat> mrna2 (new CSeq_feat());
11019     mrna2->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
11020     mrna2->SetData().SetRna().SetExt().SetName("second protein name");
11021     mrna2->SetLocation().SetInt().SetFrom(27);
11022     mrna2->SetLocation().SetInt().SetTo(29);
11023     mrna2->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
11024     mrna2->SetProduct().SetWhole().SetLocal().SetStr("nuc3");
11025     unit_test_util::AddFeat(mrna2, contig);
11026     seh = scope.AddTopLevelSeqEntry(*entry);
11027 
11028     CLEAR_ERRORS
11029 
11030     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "ProductFetchFailure",
11031                                                  "Unable to fetch mRNA transcript 'lcl|nuc3'"));
11032     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingMRNAproduct",
11033                                                  "Product Bioseq of mRNA feature is not packaged in the record"));
11034     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GenomicProductPackagingProblem",
11035                                                  "Product of mRNA feature (lcl|nuc3) not packaged in genomic product set"));
11036     //AddChromosomeNoLocation(expected_errors, entry);
11037     eval = validator.Validate(seh, options);
11038     CheckErrors (*eval, expected_errors);
11039     CLEAR_ERRORS
11040 
11041     scope.RemoveTopLevelSeqEntry(seh);
11042     // remove product from first mRNA
11043     mrna->ResetProduct();
11044     // remove second mRNA
11045     contig->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
11046     seh = scope.AddTopLevelSeqEntry(*entry);
11047     eval = validator.Validate(seh, options);
11048     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatureProductInconsistency",
11049                  "2 mRNA features have 1 product references"));
11050     expected_errors.push_back(new CExpectedError("lcl|nuc2", eDiag_Warning, "GenomicProductPackagingProblem",
11051                               "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
11052     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GenomicProductPackagingProblem",
11053                  "Product of mRNA feature (?) not packaged in genomic product set"));
11054     //AddChromosomeNoLocation(expected_errors, entry);
11055     CheckErrors(*eval, expected_errors);
11056 
11057     scope.RemoveTopLevelSeqEntry(seh);
11058     mrna->SetPseudo(true);
11059     seh = scope.AddTopLevelSeqEntry(*entry);
11060     eval = validator.Validate(seh, options);
11061 
11062     CLEAR_ERRORS
11063     expected_errors.push_back(new CExpectedError("lcl|nuc2", eDiag_Warning, "GenomicProductPackagingProblem",
11064         "Nucleotide bioseq should be product of mRNA feature on contig, but is not"));
11065     //AddChromosomeNoLocation(expected_errors, entry);
11066     CheckErrors(*eval, expected_errors);
11067 
11068     CLEAR_ERRORS
11069 }
11070 
11071 
// TESTPOPPHYMUTECO(seh, entry)
//
// Re-validates `entry` with its set class switched, in turn, to pop-set,
// phy-set, mut-set and eco-set, checking the result against `expected_errors`
// each time. Finally the class is set to small-genome-set, the entry is
// removed from the scope, its Title descriptors are removed, the entry is
// re-added (updating `seh`) and validated once more.
//
// Besides its two arguments the macro uses these names from the invoking
// scope: `scope`, `validator`, `options`, `eval`, `expected_errors`.
//
// The expansion is a single compound statement and the arguments are
// parenthesized, so the macro behaves as one statement (e.g. under an
// unbraced `if`) and may be invoked with or without a trailing semicolon.
#define TESTPOPPHYMUTECO(seh, entry) \
    { \
        (entry)->SetSet().SetClass(CBioseq_set::eClass_pop_set); \
        eval = validator.Validate((seh), options); \
        CheckErrors (*eval, expected_errors); \
        (entry)->SetSet().SetClass(CBioseq_set::eClass_phy_set); \
        eval = validator.Validate((seh), options); \
        CheckErrors (*eval, expected_errors); \
        (entry)->SetSet().SetClass(CBioseq_set::eClass_mut_set); \
        eval = validator.Validate((seh), options); \
        CheckErrors (*eval, expected_errors); \
        (entry)->SetSet().SetClass(CBioseq_set::eClass_eco_set); \
        eval = validator.Validate((seh), options); \
        CheckErrors (*eval, expected_errors); \
        (entry)->SetSet().SetClass(CBioseq_set::eClass_small_genome_set); \
        scope.RemoveTopLevelSeqEntry((seh)); \
        unit_test_util::RemoveDescriptorType((entry), CSeqdesc::e_Title); \
        (seh) = scope.AddTopLevelSeqEntry(*(entry)); \
        eval = validator.Validate((seh), options); \
        CheckErrors (*eval, expected_errors); \
    }
11091 
// TESTWGS(seh, entry)
//
// Switches the set class of `entry` to wgs-set, validates `seh` and checks
// the result against `expected_errors`.
//
// Besides its two arguments the macro uses these names from the invoking
// scope: `validator`, `options`, `eval`, `expected_errors`.
//
// The expansion is a single compound statement and the arguments are
// parenthesized, so the macro behaves as one statement (e.g. under an
// unbraced `if`) and may be invoked with or without a trailing semicolon.
#define TESTWGS(seh, entry) \
    { \
        (entry)->SetSet().SetClass(CBioseq_set::eClass_wgs_set); \
        eval = validator.Validate((seh), options); \
        CheckErrors (*eval, expected_errors); \
    }
11096 
11097 
// Giving one member of a pop/phy/mut/eco (or wgs) set a cRNA biomol must
// yield InconsistentMolType on that member plus InconsistentMoltypeSet.
BOOST_AUTO_TEST_CASE(Test_PKG_InconsistentMolInfoBiomols)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    STANDARD_SETUP

    // Only the first member of the set gets the conflicting biomol.
    CRef<CSeq_entry> first_member = entry->SetSet().SetSeq_set().front();
    unit_test_util::SetBiomol(first_member, CMolInfo::eBiomol_cRNA);

    CExpectedError* moltype_error =
        new CExpectedError("lcl|good1", eDiag_Error, "InconsistentMolType",
                           "Molecule type (DNA) does not match biomol (RNA)");
    CExpectedError* set_warning =
        new CExpectedError("lcl|good1", eDiag_Warning, "InconsistentMoltypeSet",
                           "Pop/phy/mut/eco set contains inconsistent moltype");
    expected_errors.push_back(moltype_error);
    expected_errors.push_back(set_warning);
    //AddChromosomeNoLocation(expected_errors, entry);

    TESTPOPPHYMUTECO (seh, entry)

    // Refresh the scope handle with titles removed before the WGS check.
    scope.RemoveTopLevelSeqEntry(seh);
    unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Title);
    seh = scope.AddTopLevelSeqEntry(*entry);

    TESTWGS (seh, entry);

    CLEAR_ERRORS
}
11121 
11122 
// Graph annotations built for ids other than the Bioseq they are attached to
// must be counted and reported as GraphPackagingProblem (singular and plural
// message forms).
BOOST_AUTO_TEST_CASE(Test_PKG_GraphPackagingProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CBioseq::TAnnot& annots = entry->SetSeq().SetAnnot();
    annots.push_back(unit_test_util::BuildGoodGraphAnnot("notgood"));

    STANDARD_SETUP

    // One mispackaged graph.
    CExpectedError* packaging_error =
        new CExpectedError("lcl|good", eDiag_Critical, "GraphPackagingProblem",
                           "There is 1 mispackaged graph in this record.");
    expected_errors.push_back(packaging_error);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Two mispackaged graphs: same diagnostic, plural wording.
    annots.push_back(unit_test_util::BuildGoodGraphAnnot("alsonotgood"));
    packaging_error->SetErrMsg("There are 2 mispackaged graphs in this record.");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
11143 
11144 
// A genbank set nested inside a pop/phy/mut/eco/wgs set must be reported
// with the ImproperlyNestedSets warning.
BOOST_AUTO_TEST_CASE(Test_PKG_InternalGenBankSet)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    // The offending nested set.
    CRef<CSeq_entry> nested_set(new CSeq_entry());
    nested_set->SetSet().SetClass(CBioseq_set::eClass_genbank);
    entry->SetSet().SetSeq_set().push_back(nested_set);

    STANDARD_SETUP

    const string nested_msg = "Nested sets within Pop/Phy/Mut/Eco/Wgs set";

    expected_errors.push_back(new CExpectedError("", eDiag_Warning, "ImproperlyNestedSets",
                                                 nested_msg));
    //AddChromosomeNoLocation(expected_errors, entry);

    TESTPOPPHYMUTECO (seh, entry)

    // Same expectation again for the WGS class, on a refreshed scope handle.
    CLEAR_ERRORS
    expected_errors.push_back(new CExpectedError("", eDiag_Warning, "ImproperlyNestedSets",
                                                 nested_msg));
    //AddChromosomeNoLocation(expected_errors, entry);
    scope.RemoveTopLevelSeqEntry(seh);
    unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Title);
    seh = scope.AddTopLevelSeqEntry(*entry);

    TESTWGS (seh, entry);

    CLEAR_ERRORS
}
11172 
11173 
// A set whose class is conset must be reported with ConSetProblem.
BOOST_AUTO_TEST_CASE(Test_PKG_ConSetProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Title);
    entry->SetSet().SetClass(CBioseq_set::eClass_conset);

    STANDARD_SETUP

    CExpectedError* conset_error =
        new CExpectedError("lcl|good1", eDiag_Error, "ConSetProblem",
                           "Set class should not be conset");
    expected_errors.push_back(conset_error);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
11190 
11191 
// A record consisting of a single set with no members must be reported with
// NoBioseqFound.
BOOST_AUTO_TEST_CASE(Test_PKG_NoBioseqFound)
{
    CRef<CSeq_entry> entry(new CSeq_entry());
    CBioseq_set& empty_set = entry->SetSet();
    empty_set.SetClass(CBioseq_set::eClass_eco_set);

    STANDARD_SETUP

    CExpectedError* no_bioseq_error =
        new CExpectedError("", eDiag_Error, "NoBioseqFound",
                           "No Bioseqs in this entire record.");
    expected_errors.push_back(no_bioseq_error);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
11207 
11208 
// An eco set mixing an EMBL accession with a RefSeq accession must be
// reported with INSDRefSeqPackaging, along with the title-related
// diagnostics listed below.
BOOST_AUTO_TEST_CASE(Test_PKG_INSDRefSeqPackaging)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    CBioseq_set::TSeq_set& members = entry->SetSet().SetSeq_set();

    // First member becomes an EMBL record, last member a RefSeq record.
    members.front()->SetSeq().SetId().front()->SetEmbl().SetAccession("EA123456");
    members.back()->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("emb|EA123456|", eDiag_Error, "INSDRefSeqPackaging",
                                                 "INSD and RefSeq records should not be present in the same set"));
    expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "NoOrganismInTitle",
                                                 "RefSeq nucleotide title does not start with organism name"));

    // Every member is expected to be flagged as lacking a title.
    static const char* const kUntitledAccessions[] = {
        "emb|EA123456|",
        "lcl|good2",
        "ref|NC_123456|"
    };
    const size_t kNumUntitled = sizeof(kUntitledAccessions) / sizeof(kUntitledAccessions[0]);
    for (size_t idx = 0; idx < kNumUntitled; ++idx) {
        expected_errors.push_back(new CExpectedError(kUntitledAccessions[idx], eDiag_Warning, "ComponentMissingTitle",
                                  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
    }
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
11234 
11235 
BOOST_AUTO_TEST_CASE(Test_PKG_GPSnonGPSPackaging)11236 BOOST_AUTO_TEST_CASE(Test_PKG_GPSnonGPSPackaging)
11237 {
11238     CRef<CSeq_entry> entry(new CSeq_entry());
11239     entry->SetSet().SetClass(CBioseq_set::eClass_genbank);
11240     entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodEcoSet());
11241     entry->SetSet().SetSeq_set().push_back(unit_test_util::BuildGoodGenProdSet());
11242 
11243     WriteOutTemp (entry);
11244     STANDARD_SETUP
11245 
11246     expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "GPSnonGPSPackaging",
11247                                                  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set"));
11248     expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "InconsistentMoltypeSet",
11249                                                  "Pop/phy/mut/eco set contains inconsistent moltype"));
11250     expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "ImproperlyNestedSets",
11251                                                  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11252     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ImproperlyNestedSets",
11253                                                  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11254     //AddChromosomeNoLocation(expected_errors, entry);
11255 
11256 
11257     TESTPOPPHYMUTECO (seh, entry)
11258 
11259     CLEAR_ERRORS
11260     expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "GPSnonGPSPackaging",
11261                                                  "Genomic product set and mut/pop/phy/eco set records should not be present in the same set"));
11262     expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "InconsistentMoltypeSet",
11263                                                  "Pop/phy/mut/eco set contains inconsistent moltype"));
11264     expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Warning, "ImproperlyNestedSets",
11265                                                  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11266     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ImproperlyNestedSets",
11267                                                  "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
11268     //AddChromosomeNoLocation(expected_errors, entry);
11269 
11270     TESTWGS (seh, entry);
11271 
11272     CLEAR_ERRORS
11273 }
11274 
11275 
// A pop-set containing a RefSeq record must be reported with RefSeqPopSet,
// along with the title-related diagnostics listed below.
BOOST_AUTO_TEST_CASE(Test_PKG_RefSeqPopSet)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    CBioseq_set& pop_set = entry->SetSet();
    pop_set.SetClass(CBioseq_set::eClass_pop_set);

    // Turn the first member into a RefSeq record.
    pop_set.SetSeq_set().front()->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "NoOrganismInTitle",
                                                 "RefSeq nucleotide title does not start with organism name"));
    expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Critical, "RefSeqPopSet",
                                                 "RefSeq record should not be a Pop-set"));

    // Every member is expected to be flagged as lacking a title.
    static const char* const kUntitledAccessions[] = {
        "ref|NC_123456|",
        "lcl|good2",
        "lcl|good3"
    };
    const size_t kNumUntitled = sizeof(kUntitledAccessions) / sizeof(kUntitledAccessions[0]);
    for (size_t idx = 0; idx < kNumUntitled; ++idx) {
        expected_errors.push_back(new CExpectedError(kUntitledAccessions[idx], eDiag_Warning, "ComponentMissingTitle",
                                  "Nucleotide component of pop/phy/mut/eco/wgs set is missing its title"));
    }
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
11300 
11301 
// A set whose class is left as not-set must be reported with the
// BioseqSetClassNotSet warning.
BOOST_AUTO_TEST_CASE(Test_PKG_BioseqSetClassNotSet)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Title);
    entry->SetSet().SetClass(CBioseq_set::eClass_not_set);

    STANDARD_SETUP

    CExpectedError* class_not_set_warning =
        new CExpectedError("lcl|good1", eDiag_Warning, "BioseqSetClassNotSet",
                           "Bioseq_set class not set");
    expected_errors.push_back(class_not_set_warning);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
11318 
11319 
BOOST_AUTO_TEST_CASE(Test_PKG_OrphanedProtein)
{
    // A stand-alone protein entry is validated four times, each time with a
    // different accession flavour (GenBank, EMBL, DDBJ, "other"); every run
    // is expected to produce one OrphanedProtein error for that accession.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();

    // The Bioseq's first id and the id inside the first feature's interval
    // location are always changed together, so grab both once.
    CSeq_id& bioseq_id = *entry->SetSeq().SetId().front();
    CSeq_id& feat_loc_id =
        entry->SetSeq().SetAnnot().front()->SetData().SetFtable().front()->SetLocation().SetInt().SetId();

    // GenBank accession
    bioseq_id.SetGenbank().SetAccession("AYZ12345");
    feat_loc_id.SetGenbank().SetAccession("AYZ12345");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("gb|AYZ12345|",
                           eDiag_Error,
                           "OrphanedProtein",
                           "Orphaned stand-alone protein"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // The orphan listing is checked for the GenBank case only.
    set< CBioseq_Handle > orphans = validator::ListOrphanProteins(seh);
    BOOST_CHECK_EQUAL(orphans.size(), 1);

    // EMBL accession
    scope.RemoveTopLevelSeqEntry(seh);
    bioseq_id.SetEmbl().SetAccession("AQZ12345");
    feat_loc_id.SetEmbl().SetAccession("AQZ12345");
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    ChangeErrorAcc(expected_errors, "emb|AQZ12345|");
    CheckErrors(*eval, expected_errors);

    // DDBJ accession
    scope.RemoveTopLevelSeqEntry(seh);
    bioseq_id.SetDdbj().SetAccession("ARZ12345");
    feat_loc_id.SetDdbj().SetAccession("ARZ12345");
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    ChangeErrorAcc(expected_errors, "dbj|ARZ12345|");
    CheckErrors(*eval, expected_errors);

    // "Other" accession (reported with the ref| prefix)
    scope.RemoveTopLevelSeqEntry(seh);
    bioseq_id.SetOther().SetAccession("NC_123456");
    feat_loc_id.SetOther().SetAccession("NC_123456");
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "ref|NC_123456|");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
11362 
11363 
BOOST_AUTO_TEST_CASE(Test_PKG_MisplacedMolInfo)
{
    // Attach a MolInfo descriptor (technique = WGS) directly to the nuc-prot
    // set and verify the full list of diagnostics the validator reports.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    CRef<CSeqdesc> set_level_desc(new CSeqdesc());
    set_level_desc->SetMolinfo().SetTech(CMolInfo::eTech_wgs);
    CBioseq_set& nuc_prot = entry->SetSet();
    nuc_prot.SetDescr().Set().push_back(set_level_desc);

    STANDARD_SETUP

    // Expected diagnostics, in the order they are compared by CheckErrors.
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "MisplacedMolInfo",
                           "Nuc-prot set has MolInfo on set"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
                           "HTGS/STS/GSS/WGS sequence should be genomic"));
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Error, "NucleotideTechniqueOnProtein",
                           "Protein with nucleic acid sequence method"));
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Error, "HTGS_STS_GSS_WGSshouldBeGenomic",
                           "HTGS/STS/GSS/WGS sequence should be genomic"));
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Error, "InconsistentMolInfo",
                           "Inconsistent Molinfo-completeness [1] and [0]"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "MoltypeUnknown",
                           "Molinfo-biomol unknown used"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
11392 
11393 
BOOST_AUTO_TEST_CASE(Test_PKG_ImproperlyNestedSets)
{
    // Phase 1: an unmodified good eco-set must validate with no diagnostics.
    // Phase 2: its components are replaced by a single nested eco-set, which
    //          must trigger SingleItemSet and ImproperlyNestedSets warnings.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    STANDARD_SETUP

    // Phase 1 - expected_errors is still empty here.
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Phase 2 - swap the set's contents for one nested eco-set.
    scope.RemoveTopLevelSeqEntry(seh);
    CBioseq_set& outer_set = entry->SetSet();
    outer_set.SetSeq_set().clear();
    CRef<CSeq_entry> nested_set = unit_test_util::BuildGoodEcoSet();
    outer_set.SetSeq_set().push_back(nested_set);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(
        new CExpectedError("lcl|good1", eDiag_Warning, "SingleItemSet",
                           "Pop/Phy/Mut/Eco set has only one component and no alignments"));
    expected_errors.push_back(
        new CExpectedError("lcl|good1", eDiag_Warning, "ImproperlyNestedSets",
                           "Nested sets within Pop/Phy/Mut/Eco/Wgs set"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
11421 
11422 
// Checks the diagnostics produced when a feature is placed on a Bioseq of
// an unsuitable kind. The test proceeds in stages; each stage edits the
// entry (or builds a new one), re-adds it to the scope, validates it and
// compares the result with expected_errors. The expected_errors vector is
// reused and edited in place between stages, so statement order matters.
BOOST_AUTO_TEST_CASE(Test_FEAT_InvalidForType)
{
    // Stage 1: a pseudo coding region located on lcl|prot, added to the
    // last member of the nuc-prot set.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> feat(new CSeq_feat());
    feat->SetLocation().SetInt().SetFrom(0);
    feat->SetLocation().SetInt().SetTo(5);
    feat->SetLocation().SetInt().SetId().SetLocal().SetStr("prot");
    feat->SetData().SetCdregion();
    feat->SetPseudo(true);
    unit_test_util::AddFeat (feat, entry->SetSet().SetSeq_set().back());

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "InvalidFeatureForProtein",
                                                 "Invalid feature for a protein Bioseq."));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // The same single error is expected when the feature data is switched
    // to misc RNA, Rsite, Txinit and Gene in turn.
    scope.RemoveTopLevelSeqEntry(seh);
    feat->SetData().SetRna();
    feat->SetData().SetRna().SetType(CRNA_ref::eType_miscRNA);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    scope.RemoveTopLevelSeqEntry(seh);
    feat->SetData().SetRsite();
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    scope.RemoveTopLevelSeqEntry(seh);
    feat->SetData().SetTxinit();
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    scope.RemoveTopLevelSeqEntry(seh);
    feat->SetData().SetGene().SetLocus("good locus");
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    CLEAR_ERRORS

    // Stage 2: drop the last feature from the last member's first annot,
    // retarget the same feature object to lcl|nuc with Prot data, and add
    // it to the first member of the set.
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSet().SetSeq_set().back()->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
    feat->SetLocation().SetInt().SetId().SetLocal().SetStr("nuc");
    feat->SetData().SetProt().SetName().push_back("prot name");
    unit_test_util::AddFeat(feat, entry->SetSet().SetSeq_set().front());
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error,
        "InvalidFeatureForNucleotide", "Invalid feature for a nucleotide Bioseq."));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Psec_str data is expected to give the same error.
    scope.RemoveTopLevelSeqEntry(seh);
    feat->SetData().SetPsec_str();
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    CLEAR_ERRORS

    // Stage 3: a fresh single sequence marked as RNA with mRNA biomol,
    // carrying a pseudo CDS whose location is a mix of two intervals
    // (0-10 and 21-35).
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
    unit_test_util::SetBiomol(entry, CMolInfo::eBiomol_mRNA);
    CRef<CSeq_loc> loc1(new CSeq_loc());
    loc1->SetInt().SetFrom(0);
    loc1->SetInt().SetTo(10);
    loc1->SetInt().SetId().Assign(*(entry->SetSeq().GetId().front()));
    CRef<CSeq_loc> loc2(new CSeq_loc());
    loc2->SetInt().SetFrom(21);
    loc2->SetInt().SetTo(35);
    loc2->SetInt().SetId().Assign(*(entry->SetSeq().GetId().front()));
    CRef<CSeq_feat> cds(new CSeq_feat());
    cds->SetLocation().SetMix().Set().push_back(loc1);
    cds->SetLocation().SetMix().Set().push_back(loc2);
    cds->SetData().SetCdregion();
    cds->SetPseudo(true);
    unit_test_util::AddFeat(cds, entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "InvalidForType", "Multi-interval CDS feature is invalid on an mRNA (cDNA) Bioseq."));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // different warning level if RefSeq
    // (rsid is also reused by the peptide and protein loops further down)
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> rsid(new CSeq_id());
    rsid->SetOther().SetAccession("NY_123456");
    unit_test_util::ChangeId(entry, rsid);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors[0]->SetSeverity(eDiag_Warning);
    ChangeErrorAcc(expected_errors, "ref|NY_123456|");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Stage 4: back to the local id; the same feature object becomes an
    // mRNA feature. The existing expected error is rewritten in place and
    // a second one is appended.
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> good_id(new CSeq_id());
    good_id->SetLocal().SetStr("good");
    unit_test_util::ChangeId(entry, good_id);
    cds->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
    seh = scope.AddTopLevelSeqEntry(*entry);
    ChangeErrorAcc(expected_errors, "lcl|good");
    expected_errors[0]->SetErrCode("CDSmRNAMismatchLocation");
    expected_errors[0]->SetSeverity(eDiag_Warning);
    expected_errors[0]->SetErrMsg("No CDS location match for 1 mRNA");

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFeatureForMRNA",
                                                 "mRNA feature is invalid on an mRNA (cDNA) Bioseq."));
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    CLEAR_ERRORS

    // Stage 5: the feature becomes an "intron" import feature covering the
    // interval from 0 to the last position of the sequence.
    scope.RemoveTopLevelSeqEntry(seh);
    cds->SetData().SetImp().SetKey("intron");
    cds->SetLocation().SetInt().SetFrom(0);
    cds->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
    cds->SetLocation().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidForType",
                                                 "Invalid feature for an mRNA Bioseq."));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
        "NotSpliceConsensusDonorTerminalIntron",
        "Splice donor consensus (GT) not found at start of terminal intron, position 1 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
        "NotSpliceConsensusAcceptorTerminalIntron",
        "Splice acceptor consensus (AG) not found at end of terminal intron, position 60 of lcl|good, but at end of sequence"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    CLEAR_ERRORS

    // Stage 6: import features with peptide-processing keys, located on
    // the protein of a fresh nuc-prot set.
    vector<string> peptide_feat;
    peptide_feat.push_back("mat_peptide");
    peptide_feat.push_back("sig_peptide");
    peptide_feat.push_back("transit_peptide");
    peptide_feat.push_back("preprotein");
    peptide_feat.push_back("proprotein");

    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> imp(new CSeq_feat());
    imp->SetLocation().SetInt().SetFrom(0);
    imp->SetLocation().SetInt().SetTo(5);
    imp->SetLocation().SetInt().SetId().SetLocal().SetStr("prot");
    unit_test_util::AddFeat(imp, entry->SetSet().SetSeq_set().back());
    seh = scope.AddTopLevelSeqEntry(*entry);

    // Accession and severity of this entry are overwritten inside the loop
    // before each validation.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PeptideFeatureLacksCDS",
        "Peptide processing feature should be converted to the appropriate protein feature subtype"));
    //AddChromosomeNoLocation(expected_errors, entry);
    CRef<CSeq_id> local_id(new CSeq_id());
    local_id->SetLocal().SetStr("good");
    ITERATE(vector<string>, key, peptide_feat) {
        // With a local protein id the diagnostic is expected as a warning.
        scope.RemoveTopLevelSeqEntry(seh);
        unit_test_util::ChangeProtId(entry, local_id);
        imp->SetData().SetImp().SetKey(*key);
        seh = scope.AddTopLevelSeqEntry(*entry);
        expected_errors[0]->SetAccession("lcl|good");
        expected_errors[0]->SetSeverity(eDiag_Warning);
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);

        // With the RefSeq-style id it is expected as an error.
        scope.RemoveTopLevelSeqEntry(seh);
        unit_test_util::ChangeProtId(entry, rsid);
        imp->SetData().SetImp().SetKey(*key);
        seh = scope.AddTopLevelSeqEntry(*entry);
        expected_errors[0]->SetAccession("ref|NY_123456|");
        expected_errors[0]->SetSeverity(eDiag_Error);
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
    }

    // Stage 7: import features whose key is an RNA feature name, on a
    // fresh single sequence.
    vector<string> rna_feat;
    rna_feat.push_back("mRNA");
    rna_feat.push_back("tRNA");
    rna_feat.push_back("rRNA");
    rna_feat.push_back("snRNA");
    rna_feat.push_back("scRNA");
    rna_feat.push_back("snoRNA");
    rna_feat.push_back("misc_RNA");
    rna_feat.push_back("precursor_RNA");

    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    seh = scope.AddTopLevelSeqEntry(*entry);

    // Reuse the single expected error left over from the peptide stage,
    // rewriting its code, message, severity and accession.
    expected_errors[0]->SetErrCode("InvalidRNAFeature");
    expected_errors[0]->SetErrMsg("RNA feature should be converted to the appropriate RNA feature subtype, location should be converted manually");
    expected_errors[0]->SetSeverity(eDiag_Error);
    ChangeErrorAcc(expected_errors, "lcl|good");
    ITERATE(vector<string>, key, rna_feat) {
        // Each iteration starts from an entry with no annotations and adds
        // one misc feature that is turned into an import feature with *key.
        scope.RemoveTopLevelSeqEntry(seh);
        entry->SetSeq().ResetAnnot();
        CRef<CSeq_feat> rna = unit_test_util::AddMiscFeature(entry);
        rna->SetData().SetImp().SetKey(*key);
        seh = scope.AddTopLevelSeqEntry(*entry);
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
    }

    // Stage 8: Prot features with a "processed" value, located on the
    // single sequence built for stage 7.
    vector<CProt_ref::TProcessed> prot_types;
    prot_types.push_back(CProt_ref::eProcessed_mature);
    prot_types.push_back(CProt_ref::eProcessed_transit_peptide);
    prot_types.push_back(CProt_ref::eProcessed_signal_peptide);
    prot_types.push_back(CProt_ref::eProcessed_preprotein);

    entry->SetSeq().ResetAnnot();
    CRef<CSeq_feat> prot(new CSeq_feat());
    prot->SetLocation().SetInt().SetFrom(0);
    prot->SetLocation().SetInt().SetTo(10);
    prot->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
    prot->SetData().SetProt().SetName().push_back("unnamed");
    unit_test_util::AddFeat(prot, entry);

    CLEAR_ERRORS
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "InvalidFeatureForNucleotide", "Invalid feature for a nucleotide Bioseq."));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "InvalidForType",
        "Peptide processing feature should be remapped to the appropriate protein bioseq"));
    //AddChromosomeNoLocation(expected_errors, entry);
    ITERATE(vector<CProt_ref::TProcessed>, key, prot_types) {
        // Local id: the second expected diagnostic is a warning.
        scope.RemoveTopLevelSeqEntry(seh);
        unit_test_util::ChangeId(entry, local_id);
        prot->SetData().SetProt().SetProcessed(*key);
        seh = scope.AddTopLevelSeqEntry(*entry);
        ChangeErrorAcc(expected_errors, "lcl|good");
        expected_errors[1]->SetSeverity(eDiag_Warning);
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);

        // RefSeq-style id: the second diagnostic becomes an error and a
        // third, UndesiredProteinName, is expected as well.
        scope.RemoveTopLevelSeqEntry(seh);
        unit_test_util::ChangeId(entry, rsid);
        prot->SetData().SetProt().SetProcessed(*key);
        seh = scope.AddTopLevelSeqEntry(*entry);
        ChangeErrorAcc(expected_errors, "ref|NY_123456|");
        expected_errors[1]->SetSeverity(eDiag_Error);
        expected_errors.push_back(new CExpectedError("ref|NY_123456|", eDiag_Warning, "UndesiredProteinName",
                                                     "Uninformative protein name 'unnamed'"));
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
        // Remove the temporary third entry so the next iteration starts
        // with the two base expectations again.
        delete expected_errors[2];
        expected_errors.pop_back();
    }

    CLEAR_ERRORS
}
11674 
11675 
MakeStructuredCommentField(const string & label,const string & value)11676 CRef<CUser_field> MakeStructuredCommentField(const string& label, const string& value)
11677 {
11678     CRef<CUser_field> field(new CUser_field());
11679     field->SetLabel().SetStr(label);
11680     field->SetData().SetStr(value);
11681     return field;
11682 }
11683 
11684 
BOOST_AUTO_TEST_CASE(Test_VR_828)
{
    // A gene with a locus (no locus tag) is placed on the protein of a
    // nuc-prot set that carries a Genome-Annotation-Data structured comment
    // with "Annotation Provider" = "NCBI". No diagnostics are expected.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> prot_entry = GetProteinSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> gene_feat = AddMiscFeature(prot_entry, prot_entry->GetSeq().GetLength() - 1);
    gene_feat->SetData().SetGene().SetLocus("x");

    // Build the structured comment descriptor and attach it to the set.
    CRef<CSeqdesc> annot_comment(new CSeqdesc());
    CUser_object& comment_obj = annot_comment->SetUser();
    comment_obj.SetObjectType(CUser_object::eObjectType_StructuredComment);
    comment_obj.SetData().push_back(
        MakeStructuredCommentField("StructuredCommentPrefix", "##Genome-Annotation-Data-START##"));
    comment_obj.SetData().push_back(
        MakeStructuredCommentField("Annotation Provider", "NCBI"));
    entry->SetSet().SetDescr().Set().push_back(annot_comment);

    STANDARD_SETUP

    // expected_errors stays empty: validation must come back clean.
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
11706 
11707 
BOOST_AUTO_TEST_CASE(Test_VR_829)
{
    // Same setup as the locus-only case, but the gene on the protein has a
    // locus tag; together with the Genome-Annotation-Data structured comment
    // ("Annotation Provider" = "NCBI") this must yield a LocusTagProblem error.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> prot_entry = GetProteinSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> gene_feat = AddMiscFeature(prot_entry, prot_entry->GetSeq().GetLength() - 1);
    gene_feat->SetData().SetGene().SetLocus_tag("x");

    // Build the structured comment descriptor and attach it to the set.
    CRef<CSeqdesc> annot_comment(new CSeqdesc());
    CUser_object& comment_obj = annot_comment->SetUser();
    comment_obj.SetObjectType(CUser_object::eObjectType_StructuredComment);
    comment_obj.SetData().push_back(
        MakeStructuredCommentField("StructuredCommentPrefix", "##Genome-Annotation-Data-START##"));
    comment_obj.SetData().push_back(
        MakeStructuredCommentField("Annotation Provider", "NCBI"));
    entry->SetSet().SetDescr().Set().push_back(annot_comment);

    STANDARD_SETUP

    // Validation runs first; the expectation is registered afterwards and
    // then compared against the result.
    eval = validator.Validate(seh, options);

    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Error, "LocusTagProblem",
                           "Genes on protein sequences with PGAP annotation should not have locus tags."));
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
11731 
11732 
BuildGoodSpliceNucProtSet(void)11733 static CRef<CSeq_entry> BuildGoodSpliceNucProtSet (void)
11734 {
11735     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet ();
11736     CRef<CSeq_entry> nseq = entry->SetSet().SetSeq_set().front();
11737     CRef<CSeq_entry> pseq = entry->SetSet().SetSeq_set().back();
11738     CRef<CSeq_feat>  cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
11739     CRef<CSeq_feat>  prot = pseq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
11740 
11741     nseq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGGTATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
11742 
11743     cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nseq->SetSeq().SetId().front()));
11744 #if 0
11745     CRef<CSeq_loc> loc1(new CSeq_loc());
11746     loc1->SetInt().SetId().SetLocal().SetStr("nuc");
11747     loc1->SetInt().SetFrom(0);
11748     loc1->SetInt().SetTo(15);
11749 
11750     CRef<CSeq_loc> loc2(new CSeq_loc());
11751     loc2->SetInt().SetId().SetLocal().SetStr("nuc");
11752     loc2->SetInt().SetFrom(46);
11753     loc2->SetInt().SetTo(56);
11754 
11755     cds->SetLocation().SetMix().Set().push_back(loc1);
11756     cds->SetLocation().SetMix().Set().push_back(loc2);
11757 #endif
11758 
11759     return entry;
11760 }
11761 
11762 
BOOST_AUTO_TEST_CASE(Test_FEAT_PartialProblem)11763 BOOST_AUTO_TEST_CASE(Test_FEAT_PartialProblem)
11764 {
11765     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
11766     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
11767     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11768     cds->SetPartial(true);
11769     unit_test_util::SetCompleteness (entry->SetSet().SetSeq_set().back(), CMolInfo::eCompleteness_complete);
11770     CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
11771     CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
11772 
11773     STANDARD_SETUP
11774 
11775     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11776                                                  "Coding region and protein feature partials conflict"));
11777     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
11778                                                  "Inconsistent: Product= complete, Location= partial, Feature.partial= TRUE"));
11779     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11780                                                  "CDS is partial but protein is complete"));
11781     //AddChromosomeNoLocation(expected_errors, entry);
11782     // cds 5' partial, protein complete
11783     eval = validator.Validate(seh, options);
11784     CheckErrors (*eval, expected_errors);
11785     CLEAR_ERRORS
11786 
11787     // cds 5' complete, protein 5' partial
11788     cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11789     cds->SetPartial(false);
11790     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_left);
11791     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
11792                                                  "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
11793     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11794                                                  "CDS is 5' complete but protein is NH2 partial"));
11795     //AddChromosomeNoLocation(expected_errors, entry);
11796     eval = validator.Validate(seh, options);
11797     CheckErrors (*eval, expected_errors);
11798     CLEAR_ERRORS
11799 
11800     // cds 5' partial, protein 3' partial
11801     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11802     cds->SetPartial(true);
11803     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_right);
11804     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11805                                                  "Coding region and protein feature partials conflict"));
11806     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11807                                                  "Got stop codon, but 3'end is labeled partial"));
11808     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11809                                                  "CDS is 3' complete but protein is CO2 partial"));
11810     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
11811                                                  "CDS is 5' partial but protein is CO2 partial"));
11812     //AddChromosomeNoLocation(expected_errors, entry);
11813     eval = validator.Validate(seh, options);
11814     CheckErrors (*eval, expected_errors);
11815     CLEAR_ERRORS
11816 
11817     // cds 3' partial, protein 5' partial
11818     cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11819     cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
11820     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_left);
11821     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11822                                                  "Coding region and protein feature partials conflict"));
11823     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
11824                                                  "3' partial is not at end of sequence, gap, or consensus splice site"));
11825     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11826                                                  "Got stop codon, but 3'end is labeled partial"));
11827     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11828                                                  "CDS is 5' complete but protein is NH2 partial"));
11829     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11830                                                  "CDS is 3' partial but protein is NH2 partial"));
11831     //AddChromosomeNoLocation(expected_errors, entry);
11832     eval = validator.Validate(seh, options);
11833     CheckErrors (*eval, expected_errors);
11834     CLEAR_ERRORS
11835 
11836     // cds 5' partial, protein no ends
11837     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11838     cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
11839     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_ends);
11840     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11841                                                  "Coding region and protein feature partials conflict"));
11842     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11843                                                  "Got stop codon, but 3'end is labeled partial"));
11844     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
11845                                                  "CDS is 5' partial but protein has neither end"));
11846     //AddChromosomeNoLocation(expected_errors, entry);
11847     eval = validator.Validate(seh, options);
11848     CheckErrors (*eval, expected_errors);
11849     CLEAR_ERRORS
11850 
11851     // cds 3' partial, protein no ends
11852     cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11853     cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
11854     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_ends);
11855     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11856                                                  "Coding region and protein feature partials conflict"));
11857     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
11858                                                  "3' partial is not at end of sequence, gap, or consensus splice site"));
11859     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11860                                                  "Got stop codon, but 3'end is labeled partial"));
11861     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11862                                                  "CDS is 3' partial but protein has neither end"));
11863     //AddChromosomeNoLocation(expected_errors, entry);
11864     eval = validator.Validate(seh, options);
11865     CheckErrors (*eval, expected_errors);
11866     CLEAR_ERRORS
11867 
11868     // cds complete, protein no ends
11869     cds->SetLocation().SetPartialStart(false, eExtreme_Biological);
11870     cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
11871     cds->SetPartial(false);
11872     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_ends);
11873     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
11874         "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
11875     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
11876                                                  "Got stop codon, but 3'end is labeled partial"));
11877     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
11878                                                  "CDS is complete but protein has neither end"));
11879     //AddChromosomeNoLocation(expected_errors, entry);
11880     eval = validator.Validate(seh, options);
11881     CheckErrors (*eval, expected_errors);
11882     CLEAR_ERRORS
11883 
11884     // misc feature with location whole but not marked partial
11885     scope.RemoveTopLevelSeqEntry(seh);
11886     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_complete);
11887     unit_test_util::SetCompleteness (nuc_seq, CMolInfo::eCompleteness_no_left);
11888     CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature (nuc_seq);
11889     misc_feat->SetLocation().SetWhole().SetLocal().SetStr("nuc");
11890     seh = scope.AddTopLevelSeqEntry(*entry);
11891     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "WholeLocation",
11892         "Feature may not have whole location"));
11893     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "PartialProblem",
11894         "On partial Bioseq, SeqFeat.partial should be TRUE"));
11895     //AddChromosomeNoLocation(expected_errors, entry);
11896     eval = validator.Validate(seh, options);
11897     CheckErrors (*eval, expected_errors);
11898     CLEAR_ERRORS
11899 
11900     scope.RemoveTopLevelSeqEntry(seh);
11901     unit_test_util::SetCompleteness (nuc_seq, CMolInfo::eCompleteness_unknown);
11902     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_left);
11903     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11904     cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
11905     cds->SetPartial(true);
11906     nuc_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().pop_back();
11907     misc_feat = unit_test_util::AddMiscFeature (nuc_seq);
11908     misc_feat->SetPartial(true);
11909     misc_feat->SetProduct().SetWhole().SetLocal().SetStr("prot");
11910     seh = scope.AddTopLevelSeqEntry(*entry);
11911     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11912                                                  "Coding region and protein feature partials conflict"));
11913     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
11914         "When SeqFeat.product is a partial Bioseq, SeqFeat.location should also be partial"));
11915     //AddChromosomeNoLocation(expected_errors, entry);
11916     eval = validator.Validate(seh, options);
11917     CheckErrors (*eval, expected_errors);
11918     CLEAR_ERRORS
11919 
11920     scope.RemoveTopLevelSeqEntry(seh);
11921     nuc_seq->SetSeq().ResetAnnot();
11922     CRef<CSeq_loc> first(new CSeq_loc());
11923     first->SetInt().SetId().SetLocal().SetStr("nuc");
11924     first->SetInt().SetFrom(0);
11925     first->SetInt().SetTo(5);
11926     CRef<CSeq_loc> middle(new CSeq_loc());
11927     middle->SetNull();
11928     CRef<CSeq_loc> last(new CSeq_loc());
11929     last->SetInt().SetId().SetLocal().SetStr("nuc");
11930     last->SetInt().SetFrom(7);
11931     last->SetInt().SetTo(10);
11932 
11933     CRef<CSeq_feat> gene_feat(new CSeq_feat());
11934     gene_feat->SetData().SetGene().SetLocus("locus value");
11935     gene_feat->SetLocation().SetMix().Set().push_back(first);
11936     gene_feat->SetLocation().SetMix().Set().push_back(middle);
11937     gene_feat->SetLocation().SetMix().Set().push_back(last);
11938     unit_test_util::AddFeat (gene_feat, nuc_seq);
11939     seh = scope.AddTopLevelSeqEntry(*entry);
11940     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "MultiIntervalGene",
11941         "Gene feature on non-segmented sequence should not have multiple intervals"));
11942     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSgeneRange",
11943         "gene [locus value:[lcl|nuc:1-6, ~, 8-11]] overlaps CDS but does not completely contain it"));
11944     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11945                                                  "Coding region and protein feature partials conflict"));
11946     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
11947         "Gene of 'order' with otherwise complete location should have partial flag set"));
11948     //AddChromosomeNoLocation(expected_errors, entry);
11949     eval = validator.Validate(seh, options);
11950     CheckErrors (*eval, expected_errors);
11951     CLEAR_ERRORS
11952 
11953     scope.RemoveTopLevelSeqEntry(seh);
11954     nuc_seq->SetSeq().ResetAnnot();
11955     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
11956     cds->SetLocation().SetPartialStop(false, eExtreme_Biological);
11957     cds->SetPartial(true);
11958     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_partial);
11959     seh = scope.AddTopLevelSeqEntry(*entry);
11960 
11961     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
11962                                                  "Coding region and protein feature partials conflict"));
11963     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblem",
11964         "5' or 3' partial location should not have unclassified partial in product molinfo descriptor"));
11965     //AddChromosomeNoLocation(expected_errors, entry);
11966     eval = validator.Validate(seh, options);
11967     CheckErrors (*eval, expected_errors);
11968     CLEAR_ERRORS
11969 
11970     scope.RemoveTopLevelSeqEntry(seh);
11971     entry = BuildGoodSpliceNucProtSet();
11972     misc_feat = unit_test_util::AddMiscFeature (entry->SetSet().SetSeq_set().front(), 15);
11973     misc_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
11974     misc_feat->SetPartial(true);
11975     seh = scope.AddTopLevelSeqEntry(*entry);
11976 
11977     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
11978                               "PartialProblem3Prime",
11979                               "Stop does not include first/last residue of sequence"));
11980     //AddChromosomeNoLocation(expected_errors, entry);
11981     eval = validator.Validate(seh, options);
11982     CheckErrors (*eval, expected_errors);
11983     CLEAR_ERRORS
11984 
11985     scope.RemoveTopLevelSeqEntry(seh);
11986     misc_feat->SetLocation().SetInt().SetFrom(46);
11987     misc_feat->SetLocation().SetInt().SetTo(56);
11988     misc_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
11989     misc_feat->SetLocation().SetPartialStop(false, eExtreme_Biological);
11990     seh = scope.AddTopLevelSeqEntry(*entry);
11991     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
11992                               "PartialProblem5Prime",
11993                               "Start does not include first/last residue of sequence"));
11994     //AddChromosomeNoLocation(expected_errors, entry);
11995     eval = validator.Validate(seh, options);
11996     CheckErrors (*eval, expected_errors);
11997     CLEAR_ERRORS
11998 
11999     scope.RemoveTopLevelSeqEntry(seh);
12000     // take misc_feat away
12001     entry->SetSet().SetSeq_set().front()->SetSeq().ResetAnnot();
12002     // cds, but splicing not expected
12003     // do not report, per V-763
12004     unit_test_util::SetDiv (entry, "BCT");
12005     entry->SetSet().ResetAnnot();
12006     cds.Reset(new CSeq_feat());
12007     cds->SetData().SetCdregion();
12008     cds->SetProduct().SetWhole().SetLocal().SetStr("prot");
12009     cds->SetLocation().SetInt().SetId().SetLocal().SetStr("nuc");
12010     cds->SetLocation().SetInt().SetFrom(0);
12011     cds->SetLocation().SetInt().SetTo(15);
12012     cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12013     cds->SetPartial(true);
12014     unit_test_util::AddFeat (cds, entry->SetSet().SetSeq_set().front());
12015     prot_seq = entry->SetSet().SetSeq_set().back();
12016     prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKT");
12017     prot_seq->SetSeq().SetInst().SetLength(5);
12018     prot_seq->SetSeq().ResetAnnot();
12019     CRef<CSeq_feat> prot_feat = unit_test_util::AddProtFeat(prot_seq);
12020     prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
12021     prot_feat->SetPartial(true);
12022     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_right);
12023     seh = scope.AddTopLevelSeqEntry(*entry);
12024     //AddChromosomeNoLocation(expected_errors, entry);
12025     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "PartialProblem3Prime",
12026         "Stop does not include first/last residue of sequence (but is at consensus splice site)"));
12027     eval = validator.Validate(seh, options);
12028     CheckErrors (*eval, expected_errors);
12029     CLEAR_ERRORS
12030 
12031     // splicing expected but on mRNA
12032     unit_test_util::SetDiv (entry, "PRI");
12033     entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
12034     unit_test_util::SetBiomol (entry->SetSet().SetSeq_set().front(), CMolInfo::eBiomol_mRNA);
12035     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemmRNASequence3Prime",
12036         "Stop does not include first/last residue of mRNA sequence"));
12037     eval = validator.Validate(seh, options);
12038     CheckErrors (*eval, expected_errors);
12039 
12040     CLEAR_ERRORS
12041 
12042     scope.RemoveTopLevelSeqEntry(seh);
12043     entry = unit_test_util::BuildGoodNucProtSet();
12044     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12045     cds->SetLocation().SetInt().SetFrom(3);
12046     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
12047     cds->SetPartial(true);
12048     nuc_seq = entry->SetSet().SetSeq_set().front();
12049     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[2] = '#';
12050     prot_seq = entry->SetSet().SetSeq_set().back();
12051     prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("PRKTEIN");
12052     prot_seq->SetSeq().SetInst().SetLength(7);
12053     prot_feat = prot_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
12054     prot_feat->SetLocation().SetInt().SetTo(6);
12055     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_left);
12056     seh = scope.AddTopLevelSeqEntry(*entry);
12057     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue '#' at position [3]"));
12058     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12059                                                  "Coding region and protein feature partials conflict"));
12060     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "PartialProblem",
12061         "PartialLocation: Start does not include first/last residue of sequence (and is at bad sequence)"));
12062     //AddChromosomeNoLocation(expected_errors, entry);
12063     eval = validator.Validate(seh, options);
12064     CheckErrors (*eval, expected_errors);
12065 
12066     scope.RemoveTopLevelSeqEntry(seh);
12067     entry = unit_test_util::BuildGoodNucProtSet();
12068     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12069     cds->SetLocation().SetInt().SetTo(23);
12070     cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12071     cds->SetPartial(true);
12072     nuc_seq = entry->SetSet().SetSeq_set().front();
12073     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[24] = '#';
12074     prot_seq = entry->SetSet().SetSeq_set().back();
12075     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_right);
12076     seh = scope.AddTopLevelSeqEntry(*entry);
12077     expected_errors[0]->SetErrMsg("Invalid residue '#' at position [25]");
12078     expected_errors[2]->SetErrMsg("PartialLocation: Stop does not include first/last residue of sequence (and is at bad sequence)");
12079     eval = validator.Validate(seh, options);
12080     CheckErrors (*eval, expected_errors);
12081 
12082     CLEAR_ERRORS
12083 
12084     scope.RemoveTopLevelSeqEntry(seh);
12085     entry = unit_test_util::BuildGoodNucProtSet();
12086     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12087     cds->SetLocation().SetInt().SetFrom(3);
12088     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
12089     cds->SetPartial(true);
12090     prot_seq = entry->SetSet().SetSeq_set().back();
12091     prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("PRKTEIN");
12092     prot_seq->SetSeq().SetInst().SetLength(7);
12093     prot_feat = prot_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
12094     prot_feat->SetLocation().SetInt().SetTo(6);
12095     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_left);
12096     seh = scope.AddTopLevelSeqEntry(*entry);
12097 
12098     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12099                                                  "Coding region and protein feature partials conflict"));
12100     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus5Prime",
12101         "5' partial is not at beginning of sequence, gap, or consensus splice site"));
12102     //AddChromosomeNoLocation(expected_errors, entry);
12103     eval = validator.Validate(seh, options);
12104     CheckErrors (*eval, expected_errors);
12105 
12106     CLEAR_ERRORS
12107 
12108     scope.RemoveTopLevelSeqEntry(seh);
12109     entry = unit_test_util::BuildGoodNucProtSet();
12110     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12111     cds->SetLocation().SetInt().SetTo(23);
12112     cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12113     cds->SetPartial(true);
12114     prot_seq = entry->SetSet().SetSeq_set().back();
12115     unit_test_util::SetCompleteness (prot_seq, CMolInfo::eCompleteness_no_right);
12116     seh = scope.AddTopLevelSeqEntry(*entry);
12117 
12118     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12119                                                  "Coding region and protein feature partials conflict"));
12120     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
12121         "3' partial is not at end of sequence, gap, or consensus splice site"));
12122     //AddChromosomeNoLocation(expected_errors, entry);
12123     eval = validator.Validate(seh, options);
12124     CheckErrors (*eval, expected_errors);
12125 
12126     CLEAR_ERRORS
12127 
12128     scope.RemoveTopLevelSeqEntry(seh);
12129     entry = unit_test_util::BuildGoodSeq();
12130     misc_feat = unit_test_util::AddMiscFeature (entry);
12131     misc_feat->SetLocation().SetInt().SetFrom(3);
12132     misc_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
12133     misc_feat->SetPartial(true);
12134     seh = scope.AddTopLevelSeqEntry(*entry);
12135     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PartialProblem5Prime",
12136         "Start does not include first/last residue of sequence"));
12137     //AddChromosomeNoLocation(expected_errors, entry);
12138     eval = validator.Validate(seh, options);
12139     CheckErrors (*eval, expected_errors);
12140 
12141     misc_feat->SetLocation().SetPartialStart(false, eExtreme_Biological);
12142     misc_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
12143     expected_errors[0]->SetErrCode("PartialProblem3Prime");
12144     expected_errors[0]->SetErrMsg("Stop does not include first/last residue of sequence");
12145     eval = validator.Validate(seh, options);
12146     CheckErrors (*eval, expected_errors);
12147 
12148     CLEAR_ERRORS
12149     misc_feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
12150     misc_feat->SetLocation().SetMix().Set().front()->SetPartialStop(true, eExtreme_Biological);
12151     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "PartialProblem",
12152         "PartialLocation: Internal partial intervals do not include first/last residue of sequence"));
12153     //AddChromosomeNoLocation(expected_errors, entry);
12154     eval = validator.Validate(seh, options);
12155     CheckErrors (*eval, expected_errors);
12156 
12157     // suppress for RefSeq
12158     CLEAR_ERRORS
12159     scope.RemoveTopLevelSeqEntry(seh);
12160     CRef<CSeq_id> refseq_id(new CSeq_id());
12161     refseq_id->SetOther().SetAccession("NC_123456");
12162     entry->SetSeq().SetId().push_back(refseq_id);
12163     seh = scope.AddTopLevelSeqEntry(*entry);
12164     //AddChromosomeNoLocation(expected_errors, "ref|NC_123456|");
12165     eval = validator.Validate(seh, options);
12166     CheckErrors(*eval, expected_errors);
12167 
12168     CLEAR_ERRORS
12169 
12170     scope.RemoveTopLevelSeqEntry(seh);
12171     entry = unit_test_util::BuildGoodNucProtSet();
12172     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12173     cds->SetPartial(true);
12174     prot_seq = entry->SetSet().SetSeq_set().back();
12175     prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("KPRKTEIN");
12176     seh = scope.AddTopLevelSeqEntry(*entry);
12177 
12178     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
12179         "Inconsistent: Product= complete, Location= complete, Feature.partial= TRUE"));
12180     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12181         "Start of location should probably be partial"));
12182     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12183         "This SeqFeat should not be partial"));
12184     //AddChromosomeNoLocation(expected_errors, entry);
12185 
12186     eval = validator.Validate(seh, options);
12187     CheckErrors (*eval, expected_errors);
12188 
12189     CLEAR_ERRORS
12190 
12191     scope.RemoveTopLevelSeqEntry(seh);
12192     entry = unit_test_util::BuildGoodNucProtSet();
12193     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12194     cds->SetPartial(true);
12195     cds->SetLocation().SetInt().SetTo(23);
12196     seh = scope.AddTopLevelSeqEntry(*entry);
12197 
12198     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
12199         "Inconsistent: Product= complete, Location= complete, Feature.partial= TRUE"));
12200     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12201         "End of location should probably be partial"));
12202     //AddChromosomeNoLocation(expected_errors, entry);
12203     eval = validator.Validate(seh, options);
12204     CheckErrors (*eval, expected_errors);
12205 
12206     scope.RemoveTopLevelSeqEntry(seh);
12207     entry = unit_test_util::BuildGoodNucProtSet();
12208     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12209     cds->SetPartial(true);
12210     seh = scope.AddTopLevelSeqEntry(*entry);
12211 
12212     expected_errors[1]->SetErrMsg("This SeqFeat should not be partial");
12213     eval = validator.Validate(seh, options);
12214     CheckErrors (*eval, expected_errors);
12215 
12216     CLEAR_ERRORS
12217 
12218     cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
12219     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
12220                                                  "Coding region and protein feature partials conflict"));
12221     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
12222                                                  "3' partial is not at end of sequence, gap, or consensus splice site"));
12223     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
12224         "Inconsistent: Product= complete, Location= partial, Feature.partial= TRUE"));
12225     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
12226                                                  "Got stop codon, but 3'end is labeled partial"));
12227     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem",
12228                                                  "CDS is partial but protein is complete"));
12229     //AddChromosomeNoLocation(expected_errors, entry);
12230     eval = validator.Validate(seh, options);
12231     CheckErrors (*eval, expected_errors);
12232 
12233     CLEAR_ERRORS
12234 }
12235 
12236 
SetUpMiscForPartialTest(CSeq_feat & feat,TSeqPos start,TSeqPos stop,bool pseudo)12237 void SetUpMiscForPartialTest(CSeq_feat& feat, TSeqPos start, TSeqPos stop, bool pseudo)
12238 {
12239     feat.SetLocation().SetInt().SetFrom(start);
12240     feat.SetLocation().SetInt().SetTo(stop);
12241     if (pseudo) {
12242         feat.SetPseudo(true);
12243     } else {
12244         feat.ResetPseudo();
12245     }
12246 }
12247 
12248 
CheckMiscPartialErrors(CRef<CSeq_entry> entry,bool expect_bad_5,bool expect_bad_3)12249 void CheckMiscPartialErrors(CRef<CSeq_entry> entry, bool expect_bad_5, bool expect_bad_3)
12250 {
12251     STANDARD_SETUP
12252 
12253     eval = validator.Validate(seh, options);
12254     if (expect_bad_5) {
12255         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
12256             "PartialProblem5Prime",
12257             "Start does not include first/last residue of sequence"));
12258     }
12259     if (expect_bad_3) {
12260         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
12261             "PartialProblem3Prime",
12262             "Stop does not include first/last residue of sequence"));
12263     }
12264     if (entry->GetSeq().GetAnnot().front()->GetData().GetFtable().front()->GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
12265         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
12266             "CDSmRNAMismatchLocation", "No CDS location match for 1 mRNA"));
12267     }
12268     //AddChromosomeNoLocation(expected_errors, entry);
12269     CheckErrors(*eval, expected_errors);
12270     CLEAR_ERRORS
12271 }
12272 
12273 
TestOneMiscPartial(CRef<CSeq_entry> entry,TSeqPos good_start,TSeqPos bad_start,TSeqPos good_stop,TSeqPos bad_stop,bool is_mrna)12274 void TestOneMiscPartial(CRef<CSeq_entry> entry, TSeqPos good_start, TSeqPos bad_start, TSeqPos good_stop, TSeqPos bad_stop, bool is_mrna)
12275 {
12276     entry->SetSeq().ResetAnnot();
12277     CRef<CSeq_feat> misc = AddMiscFeature(entry);
12278     if (is_mrna) {
12279         misc->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
12280         misc->SetData().SetRna().SetExt().SetName("fake mRNA name");
12281     }
12282     misc->SetLocation().SetPartialStart(true, eExtreme_Biological);
12283     misc->SetLocation().SetPartialStop(true, eExtreme_Biological);
12284     misc->SetPartial(true);
12285 
12286     SetUpMiscForPartialTest(*misc, good_start, good_stop, false);
12287     CheckMiscPartialErrors(entry, false, false);
12288 
12289     SetUpMiscForPartialTest(*misc, good_start, good_stop, true);
12290     CheckMiscPartialErrors(entry, false, false);
12291 
12292     SetUpMiscForPartialTest(*misc, bad_start, good_stop, false);
12293     CheckMiscPartialErrors(entry, true, false);
12294 
12295     SetUpMiscForPartialTest(*misc, bad_start, good_stop, true);
12296     CheckMiscPartialErrors(entry, false, false);
12297 
12298     SetUpMiscForPartialTest(*misc, good_start, bad_stop, false);
12299     CheckMiscPartialErrors(entry, false, true);
12300 
12301     SetUpMiscForPartialTest(*misc, good_start, bad_stop, true);
12302     CheckMiscPartialErrors(entry, false, false);
12303 
12304     SetUpMiscForPartialTest(*misc, bad_start, bad_stop, false);
12305     CheckMiscPartialErrors(entry, true, true);
12306 
12307     SetUpMiscForPartialTest(*misc, bad_start, bad_stop, true);
12308     CheckMiscPartialErrors(entry, false, false);
12309 }
12310 
12311 
// Runs the partial-location checks for a misc feature placed at (and one
// base away from) the ends of the sequence.
// The mRNA, gap and splice-site variants below are compiled out by "#if 0".
BOOST_AUTO_TEST_CASE(Test_VR_763)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();
    const TSeqPos seq_len = entry->GetSeq().GetLength();

    // ends
    TestOneMiscPartial(entry, 0, 1, seq_len - 1, seq_len - 2, false);
#if 0
    TestOneMiscPartial(entry, 0, 1, entry->GetSeq().GetLength() - 1, entry->GetSeq().GetLength() - 2, true);

    // gap
    entry->SetSeq().SetInst().ResetSeq_data();
    entry->SetSeq().SetInst().SetRepr(objects::CSeq_inst::eRepr_delta);
    entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("ATGATGATGCCCAAATTTGGGAAAA", objects::CSeq_inst::eMol_dna);
    CRef<objects::CDelta_seq> gap1(new objects::CDelta_seq());
    gap1->SetLiteral().SetSeq_data().SetGap();
    gap1->SetLiteral().SetLength(10);
    entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap1);
    entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("CCCATGATGATGAAATTTGGGCCCC", objects::CSeq_inst::eMol_dna);
    CRef<objects::CDelta_seq> gap2(new objects::CDelta_seq());
    gap2->SetLiteral().SetSeq_data().SetGap();
    gap2->SetLiteral().SetLength(10);
    entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(gap2);
    entry->SetSeq().SetInst().SetExt().SetDelta().AddLiteral("AAACCCATGATGATGCCAATTCCCG", objects::CSeq_inst::eMol_dna);
    entry->SetSeq().SetInst().SetLength(95);
    TestOneMiscPartial(entry, 36, 37, 58, 57, false);
    TestOneMiscPartial(entry, 36, 37, 58, 57, true);

    // splice
    entry = BuildGoodSeq();
    entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AGTTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCGT");
    TestOneMiscPartial(entry, 0, 2, 59, 57, false);
    TestOneMiscPartial(entry, 2, 3, 57, 56, true);

#endif
}
12347 
12348 
// A feature whose data choice has been reset is expected to be reported
// with an InvalidType error.
BOOST_AUTO_TEST_CASE(Test_FEAT_InvalidType)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Add a misc feature and wipe its data so that no feature type is selected.
    CRef<CSeq_feat> untyped_feat = unit_test_util::AddMiscFeature(entry);
    untyped_feat->SetData().Reset();

    STANDARD_SETUP

    eval = validator.Validate(seh, options);

    //AddChromosomeNoLocation(expected_errors, entry);
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "InvalidType", "Invalid SeqFeat type [0]"));
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
12364 
12365 
// Exercises range-related feature diagnostics:
//   * tRNArange - anticodon locations that are the wrong length, outside the
//                 tRNA, or outside the sequence
//   * CDSrange  - code-break locations that are not inside the coding region
//   * Range     - feature locations that fall outside the sequence
// The sub-tests share one scope/validator and frequently re-use the entries of
// expected_errors by editing them in place, so statement order matters.
BOOST_AUTO_TEST_CASE(Test_FEAT_Range)
{
    // tRNA whose anticodon interval start has been moved to 14
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
    trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(14);
    unit_test_util::AddFeat (trna, entry);

    STANDARD_SETUP

    // Three tRNArange diagnostics are expected; only the last one (index 2)
    // carries the location text and is rewritten for the follow-up checks.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "tRNArange",
                                                 "Anticodon is not 3 bases in length"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "tRNArange",
                                                 "Anticodon location not in tRNA"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "tRNArange",
        "Anticodon location [lcl|good:15-14] out of range"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    // anticodon stop moved to 100: same three diagnostics, new location text
    trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(100);
    expected_errors[2]->SetErrMsg("Anticodon location [lcl|good:15-101] out of range");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    // anticodon stop at 50 with an invalid start position; the expected
    // message shows the start as 0
    trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(50);
    trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(kInvalidSeqPos);
    expected_errors[2]->SetErrMsg("Anticodon location [lcl|good:0-51] out of range");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Code break at 27-29 on a fresh nuc-prot set: expected to be reported as
    // lying outside the coding region.
    CLEAR_ERRORS
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    CRef<CCode_break> codebreak(new CCode_break());
    codebreak->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
    codebreak->SetLoc().SetInt().SetFrom(27);
    codebreak->SetLoc().SetInt().SetTo(29);
    cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "CDSrange",
                                                 "Code-break location not in coding region"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS

    // Code break moved to 0-1 while the CDS is switched to frame three and
    // extended to the last base of the nucleotide sequence; the set is marked
    // partial and re-translated before being re-added to the scope.
    // The expected message gains the "may be frame problem" suffix.
    codebreak->SetLoc().SetInt().SetFrom(0);
    codebreak->SetLoc().SetInt().SetTo(1);
    cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_three);
    CRef<CSeq_entry> nentry = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    cds->SetLocation().SetInt().SetTo(nentry->GetSeq().GetInst().GetLength() - 1);
    unit_test_util::SetNucProtSetPartials (entry, true, true);
    unit_test_util::RetranslateCdsForNucProtSet (entry, scope);
    scope.RemoveTopLevelSeqEntry(seh);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "CDSrange",
                                                 "Code-break location not in coding region - may be frame problem"));
    //AddChromosomeNoLocation(expected_errors, entry);
    // The diagnostic filter is active only while Validate runs and is cleared
    // right after.
    // NOTE(review): which diagnostic "1210.8" refers to is not visible here.
    SetDiagFilter(eDiagFilter_All, "!(1210.8)");
    eval = validator.Validate(seh, options);
    SetDiagFilter(eDiagFilter_All, "");
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS

    // Misc feature converted into a tRNA with the anticodon at 11-13:
    // expected to be reported as not in the tRNA.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature (entry);
    misc->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
    misc->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('N');
    misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetId().SetLocal().SetStr("good");
    misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(11);
    misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(13);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "tRNArange",
                                                 "Anticodon location not in tRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Anticodon at 6-10 (five bases): the single expected entry is re-used
    // and downgraded to the length warning.
    misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(6);
    misc->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(10);
    expected_errors[0]->SetSeverity(eDiag_Warning);
    expected_errors[0]->SetErrMsg("Anticodon is not 3 bases in length");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Misc feature whose start (11) is set on a fresh sequence; the expected
    // message shows the location as 12-11. The same expected entry is turned
    // into a critical "Range" error.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    misc = unit_test_util::AddMiscFeature (entry);
    misc->SetLocation().SetInt().SetFrom(11);
    // Filter is applied only around adding the entry to the scope.
    // NOTE(review): which diagnostic "1204.1" refers to is not visible here.
    SetDiagFilter(eDiagFilter_All, "!(1204.1)");
    seh = scope.AddTopLevelSeqEntry(*entry);
    SetDiagFilter(eDiagFilter_All, "");
    expected_errors[0]->SetErrCode("Range");
    expected_errors[0]->SetSeverity(eDiag_Critical);
    expected_errors[0]->SetErrMsg("Location: SeqLoc [lcl|good:12-11] out of range");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    // Misc feature whose stop is set to 100; expected message shows 1-101
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    misc = unit_test_util::AddMiscFeature (entry);
    misc->SetLocation().SetInt().SetTo(100);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors[0]->SetErrMsg("Location: SeqLoc [lcl|good:1-101] out of range");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
12479 
12480 
BOOST_AUTO_TEST_CASE(Test_tRNA_Mixed_Loc) // Jira: VR_133
{
    // Builds a tRNA whose anticodon is a two-interval mix location with
    // both intervals on the plus strand, then validates it.  No expected
    // errors are registered, so CheckErrors is given an empty list.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    auto seq_id = entry->SetSeq().SetId().front();
    CRef<CSeq_feat> trna_feat = unit_test_util::BuildtRNA(seq_id); // N(Asn)
    CRef<CSeq_loc> anticodon = unit_test_util::MakeMixLoc(seq_id);

    // First interval: single base at position 0.
    auto& first_piece = anticodon->SetMix().Set().front()->SetInt(); // A
    first_piece.SetFrom(0);
    first_piece.SetTo(0);
    first_piece.SetStrand(eNa_strand_plus);

    // Second interval: positions 2..3.
    auto& last_piece = anticodon->SetMix().Set().back()->SetInt(); // TT
    last_piece.SetFrom(2);
    last_piece.SetTo(3);
    last_piece.SetStrand(eNa_strand_plus);

    trna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon);
    unit_test_util::AddFeat(trna_feat, entry);

    // NOTE(review): STANDARD_SETUP is defined outside this chunk; it is
    // presumed to declare scope, seh, validator, options, expected_errors
    // and eval, and to reference the local named "entry".
    STANDARD_SETUP

    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
12503 
12504 
BOOST_AUTO_TEST_CASE(Test_FEAT_MixedStrand)12505 BOOST_AUTO_TEST_CASE(Test_FEAT_MixedStrand)
12506 {
12507     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
12508     CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
12509     CRef<CSeq_loc> anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
12510     anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(0);
12511     anticodon_loc->SetMix().Set().front()->SetInt().SetTo(0);
12512     anticodon_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_minus);
12513     anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(9);
12514     anticodon_loc->SetMix().Set().back()->SetInt().SetTo(10);
12515     trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
12516     unit_test_util::AddFeat (trna, entry);
12517 
12518     STANDARD_SETUP
12519 
12520     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "AnticodonMixedStrand",
12521                                                  "Mixed strands in Anticodon [[lcl|good:c1-1, 10-11]]"));
12522     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA",
12523                                                  "Codons predicted from anticodon (UAA) cannot produce amino acid (N/Asn)"));
12524     //AddChromosomeNoLocation(expected_errors, entry);
12525     eval = validator.Validate(seh, options);
12526     CheckErrors (*eval, expected_errors);
12527 
12528     scope.RemoveTopLevelSeqEntry(seh);
12529     entry = unit_test_util::BuildGoodSeq();
12530     trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
12531     anticodon_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
12532     anticodon_loc->SetMix().Set().front()->SetInt().SetFrom(0);
12533     anticodon_loc->SetMix().Set().front()->SetInt().SetTo(0);
12534     anticodon_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_plus);
12535     anticodon_loc->SetMix().Set().back()->SetInt().SetFrom(9);
12536     anticodon_loc->SetMix().Set().back()->SetInt().SetTo(10);
12537     trna->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon_loc);
12538     unit_test_util::AddFeat (trna, entry);
12539     seh = scope.AddTopLevelSeqEntry(*entry);
12540 
12541     expected_errors[0]->SetErrCode("AnticodonMixedStrand");
12542     expected_errors[0]->SetErrMsg("Mixed plus and unknown strands in Anticodon [[lcl|good:1-1, 10-11]]");
12543     expected_errors[1]->SetErrMsg("Codons predicted from anticodon (AAA) cannot produce amino acid (N/Asn)");
12544     eval = validator.Validate(seh, options);
12545     CheckErrors (*eval, expected_errors);
12546 
12547     CLEAR_ERRORS
12548 
12549     scope.RemoveTopLevelSeqEntry(seh);
12550     entry = unit_test_util::BuildGoodSeq();
12551     CRef<CSeq_feat> gene = AddMiscFeature(entry);
12552     CRef<CSeq_loc> gene_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
12553     gene_loc->SetMix().Set().front()->SetInt().SetFrom(0);
12554     gene_loc->SetMix().Set().front()->SetInt().SetTo(0);
12555     gene_loc->SetMix().Set().front()->SetInt().SetStrand(eNa_strand_minus);
12556     gene_loc->SetMix().Set().back()->SetInt().SetFrom(9);
12557     gene_loc->SetMix().Set().back()->SetInt().SetTo(10);
12558     gene->SetLocation().Assign(*gene_loc);
12559     seh = scope.AddTopLevelSeqEntry(*entry);
12560     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MixedStrand",
12561         "Location: Mixed strands in SeqLoc [(lcl|good:c1-1, 10-11)]"));
12562     //AddChromosomeNoLocation(expected_errors, entry);
12563     eval = validator.Validate(seh, options);
12564     CheckErrors(*eval, expected_errors);
12565 
12566     // warning if gene is pseudo
12567     scope.RemoveTopLevelSeqEntry(seh);
12568     gene->SetPseudo(true);
12569     seh = scope.AddTopLevelSeqEntry(*entry);
12570     expected_errors[0]->SetSeverity(eDiag_Warning);
12571     eval = validator.Validate(seh, options);
12572     CheckErrors(*eval, expected_errors);
12573     CLEAR_ERRORS
12574 }
12575 
12576 
BOOST_AUTO_TEST_CASE(Test_FEAT_SeqLocOrder)
{
    // --- Case 1: tRNA anticodon whose intervals are listed out of order
    //     (9..10 first, then 0..0).
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna_feat = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
    CRef<CSeq_loc> unordered_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
    {
        auto& head = unordered_loc->SetMix().Set().front()->SetInt();
        head.SetFrom(9);
        head.SetTo(10);

        auto& tail = unordered_loc->SetMix().Set().back()->SetInt();
        tail.SetFrom(0);
        tail.SetTo(0);
    }
    trna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*unordered_loc);
    unit_test_util::AddFeat(trna_feat, entry);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "SeqLocOrder",
                           "Intervals out of order in Anticodon [[lcl|good:10-11, 1-1]]"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA",
                           "Codons predicted from anticodon (AAA) cannot produce amino acid (N/Asn)"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // --- Case 2: the same out-of-order location used as the location of
    //     a misc feature on a fresh entry.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    misc_feat->SetLocation().Assign(*unordered_loc);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "SeqLocOrder",
                           "Location: Intervals out of order in SeqLoc [(lcl|good:10-11, 1-1)]"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
12614 
12615 
BOOST_AUTO_TEST_CASE(Test_FEAT_CdTransFail)
{
    // A diag filter is installed for the duration of the test and reset
    // to the empty filter at the end.
    SetDiagFilter(eDiagFilter_All, "!(1204.1)");

    // Move the CDS start to 27 so that the interval becomes 28-27 in the
    // validator's one-based message below.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetLocation().SetInt().SetFrom(27);

    STANDARD_SETUP

    BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Critical, "Range",
                           "Location: SeqLoc [lcl|nuc:28-27] out of range"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "ProductLength",
                           "Protein product length [8] is more than 120% of the translation length [0]"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
                           "Given protein length [8] does not match translation length [0]"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "NoStop",
                           "Missing stop codon"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
    SetDiagFilter(eDiagFilter_All, "");
}
12641 
12642 
// Shorthand for appending frequently used expected validator errors.
// Each macro expands to one complete statement (the trailing semicolon is
// part of the expansion) and needs a variable named expected_errors to be
// in scope where it is used.
#define START_CODON_AND_INT_STOP_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Error, "StartCodon", \
        "Illegal start codon (and 1 internal stops). Probably wrong genetic code [0]"));

#define INTERNAL_STOP_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Error, "InternalStop", \
        "1 internal stops (and illegal start codon). Genetic code [0]"));

#define NO_STOP_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Error, "NoStop", \
        "Missing stop codon"));

#define NO_PUB_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Error, "NoPubFound", \
        "No publications anywhere on this entire record."));

#define PROT_LEN_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Error, "TransLen", \
        "Given protein length [8] does not match translation length [9]"));

#define NO_SUB_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Info, "MissingPubRequirement", \
        "No submission citation anywhere on this entire record."));

#define EXCEPTION_PROBLEM_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Error, "ExceptionProblem", \
        "unclassified translation discrepancy is not a legal exception explanation"));

#define NO_SRC_ERR \
    expected_errors.push_back(new CExpectedError( \
        "lcl|nuc", eDiag_Error, "NoSourceDescriptor", \
        "No source information included on this record."));
12662 
12663 
BOOST_AUTO_TEST_CASE(Test_FEAT_StartCodon)12664 BOOST_AUTO_TEST_CASE(Test_FEAT_StartCodon)
12665 {
12666     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
12667     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12668     cds->SetLocation().SetInt().SetFrom(1);
12669     cds->SetLocation().SetInt().SetTo(27);
12670 
12671     CRef<CSeq_entry> nuc(new CSeq_entry());
12672     nuc->Assign(*unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry));
12673     CRef<CSeq_feat> nuc_only_cds(new CSeq_feat());
12674     nuc_only_cds->Assign(*cds);
12675     unit_test_util::AddFeat(nuc_only_cds, nuc);
12676 
12677     STANDARD_SETUP
12678 
12679     BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
12680     BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
12681     BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
12682 
12683     START_CODON_AND_INT_STOP_ERR
12684     INTERNAL_STOP_ERR
12685     NO_STOP_ERR
12686     PROT_LEN_ERR
12687     //AddChromosomeNoLocation(expected_errors, entry);
12688 
12689     eval = validator.Validate(seh, options);
12690     CheckErrors (*eval, expected_errors);
12691     CLEAR_ERRORS
12692 
12693     scope.RemoveTopLevelSeqEntry(seh);
12694     seh = scope.AddTopLevelSeqEntry(*nuc);
12695     eval = validator.Validate(seh, options);
12696 
12697     START_CODON_AND_INT_STOP_ERR
12698     INTERNAL_STOP_ERR
12699     NO_STOP_ERR
12700     NO_PUB_ERR
12701     NO_SUB_ERR
12702     NO_SRC_ERR
12703     CheckErrors(*eval, expected_errors);
12704     CLEAR_ERRORS
12705 
12706 
12707     scope.RemoveTopLevelSeqEntry(seh);
12708     seh = scope.AddTopLevelSeqEntry(*entry);
12709 
12710     // don't report start codon if unclassified exception
12711     cds->SetExcept(true);
12712     cds->SetExcept_text("unclassified translation discrepancy");
12713 
12714     INTERNAL_STOP_ERR
12715     EXCEPTION_PROBLEM_ERR
12716     //AddChromosomeNoLocation(expected_errors, entry);
12717 
12718     expected_errors[0]->SetSeverity(eDiag_Warning);
12719     eval = validator.Validate(seh, options);
12720     CheckErrors (*eval, expected_errors);
12721     CLEAR_ERRORS
12722 
12723     scope.RemoveTopLevelSeqEntry(seh);
12724     nuc_only_cds->Assign(*cds);
12725     seh = scope.AddTopLevelSeqEntry(*nuc);
12726     eval = validator.Validate(seh, options);
12727     EXCEPTION_PROBLEM_ERR
12728     INTERNAL_STOP_ERR
12729     NO_PUB_ERR
12730     NO_SUB_ERR
12731     NO_SRC_ERR
12732     expected_errors[1]->SetSeverity(eDiag_Warning);
12733     CheckErrors(*eval, expected_errors);
12734     CLEAR_ERRORS
12735 
12736     scope.RemoveTopLevelSeqEntry(seh);
12737     entry = unit_test_util::BuildGoodNucProtSet();
12738     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12739     cds->SetExcept(false);
12740     cds->ResetExcept_text();
12741     CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
12742     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[0] = 'C';
12743     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[1] = 'C';
12744     seh = scope.AddTopLevelSeqEntry(*entry);
12745     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon",
12746                               "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
12747     //AddChromosomeNoLocation(expected_errors, entry);
12748     eval = validator.Validate(seh, options);
12749     CheckErrors (*eval, expected_errors);
12750     CLEAR_ERRORS
12751 
12752     // don't report start codon if unclassified exception
12753     cds->SetExcept(true);
12754     cds->SetExcept_text("unclassified translation discrepancy");
12755     EXCEPTION_PROBLEM_ERR
12756     //AddChromosomeNoLocation(expected_errors, entry);
12757 
12758     eval = validator.Validate(seh, options);
12759     CheckErrors (*eval, expected_errors);
12760     CLEAR_ERRORS
12761 
12762 }
12763 
12764 
BOOST_AUTO_TEST_CASE(Test_FEAT_InternalStop)12765 BOOST_AUTO_TEST_CASE(Test_FEAT_InternalStop)
12766 {
12767     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
12768     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12769     cds->SetLocation().SetInt().SetFrom(1);
12770     cds->SetLocation().SetInt().SetTo(27);
12771 
12772     STANDARD_SETUP
12773 
12774     BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
12775     BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
12776     BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
12777 
12778     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StartCodon",
12779                               "Illegal start codon (and 1 internal stops). Probably wrong genetic code [0]"));
12780     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop",
12781                               "1 internal stops (and illegal start codon). Genetic code [0]"));
12782     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
12783                                                  "Given protein length [8] does not match translation length [9]"));
12784     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "NoStop",
12785                                                  "Missing stop codon"));
12786     //AddChromosomeNoLocation(expected_errors, entry);
12787 
12788     eval = validator.Validate(seh, options);
12789     CheckErrors (*eval, expected_errors);
12790 
12791     CLEAR_ERRORS
12792 
12793     scope.RemoveTopLevelSeqEntry(seh);
12794     entry = unit_test_util::BuildGoodNucProtSet();
12795     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12796     CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
12797     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[9] = 'T';
12798     entry->SetSet().SetSeq_set().back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPR*TEIN");
12799     seh = scope.AddTopLevelSeqEntry(*entry);
12800 
12801     BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
12802     BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);
12803 
12804     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "StopInProtein",
12805                               "[1] termination symbols in protein sequence (gene? - fake protein name)"));
12806     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "InternalStop",
12807                               "1 internal stops. Genetic code [0]"));
12808     //AddChromosomeNoLocation(expected_errors, entry);
12809     eval = validator.Validate(seh, options);
12810     CheckErrors (*eval, expected_errors);
12811 
12812     CValidErrorFormat format(*objmgr);
12813     string rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_InternalStop);
12814     BOOST_CHECK_EQUAL(rval, "InternalStop\nlcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\t\n");
12815 
12816     // try again with locus tag for report
12817     scope.RemoveTopLevelSeqEntry(seh);
12818     CRef<CSeq_feat> gene = MakeGeneForFeature(cds);
12819     gene->SetData().SetGene().SetLocus_tag("a_locus_tag");
12820     AddFeat(gene, nuc_seq);
12821     seh = scope.AddTopLevelSeqEntry(*entry);
12822     eval = validator.Validate(seh, options);
12823     rval = format.FormatForSubmitterReport(*eval, scope, eErr_SEQ_FEAT_InternalStop);
12824     BOOST_CHECK_EQUAL(rval, "InternalStop\nlcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\ta_locus_tag\n");
12825 
12826     CLEAR_ERRORS
12827 }
12828 
12829 
BOOST_AUTO_TEST_CASE(Test_FEAT_NoProtein)
{
    // Drop the last member of the set (taken elsewhere in this file as
    // the protein) and clear the CDS product.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    entry->SetSet().SetSeq_set().pop_back();
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds_feat->ResetProduct();

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "NucProtProblem",
                           "No proteins in nuc-prot set"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "NoProtein",
                           "No protein Bioseq given"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "MissingCDSproduct",
                           "Expected CDS product absent"));
    //AddChromosomeNoLocation(expected_errors, entry);

    // This test additionally turns on the far-fetch-CDS-products option.
    options |= CValidator::eVal_far_fetch_cds_products;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
12853 
12854 
BOOST_AUTO_TEST_CASE(Test_FEAT_MisMatchAA)
{
    // --- Case 1: a single mismatching residue at protein position 0.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    unit_test_util::MakeNucProtSet3Partial(entry);
    CRef<CSeq_entry> protein = entry->SetSet().SetSeq_set().back();
    protein->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[0] = 'A';

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "MisMatchAA",
                           "Residue 1 in protein [A] != translation [M] at lcl|nuc:1-3"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // --- Case 2: overwrite the first eleven residues with 'A'; the same
    //     expectation is reused with the summary message.
    {
        auto& residues = protein->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set();
        for (int pos = 0; pos < 11; ++pos) {
            residues[pos] = 'A';
        }
    }

    expected_errors[0]->SetErrMsg("11 mismatches found.  First mismatch at 1, residue in protein [A] != translation [M] at lcl|nuc:1-3.  Last mismatch at 11, residue in protein [A] != translation [M] at lcl|nuc:31-33.  Genetic code [0]");

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
12884 
12885 
BOOST_AUTO_TEST_CASE(Test_FEAT_TransLen)12886 BOOST_AUTO_TEST_CASE(Test_FEAT_TransLen)
12887 {
12888     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
12889     CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
12890     prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEI");
12891     prot_seq->SetSeq().SetInst().SetLength(7);
12892     unit_test_util::AdjustProtFeatForNucProtSet (entry);
12893 
12894     STANDARD_SETUP
12895 
12896     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
12897                                                  "Given protein length [7] does not match translation length [9]"));
12898     //AddChromosomeNoLocation(expected_errors, entry);
12899 
12900     eval = validator.Validate(seh, options);
12901     CheckErrors (*eval, expected_errors);
12902 
12903     CLEAR_ERRORS
12904 
12905     scope.RemoveTopLevelSeqEntry(seh);
12906     entry = unit_test_util::BuildGoodNucProtSet();
12907     CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
12908     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[27] = 'A';
12909     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[28] = 'T';
12910     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12911     cds->SetLocation().SetInt().SetTo(28);
12912     seh = scope.AddTopLevelSeqEntry(*entry);
12913     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
12914                               "Coding region extends 2 base(s) past stop codon"));
12915     //AddChromosomeNoLocation(expected_errors, entry);
12916 
12917     eval = validator.Validate(seh, options);
12918     CheckErrors (*eval, expected_errors);
12919 
12920     CLEAR_ERRORS
12921 
12922     scope.RemoveTopLevelSeqEntry(seh);
12923     entry = unit_test_util::BuildGoodNucProtSet();
12924     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
12925     prot_seq = entry->SetSet().SetSeq_set().back();
12926     prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRKTEINQQLLLLLLLLLLQQQQQQQQQQ");
12927     prot_seq->SetSeq().SetInst().SetLength(30);
12928     unit_test_util::AdjustProtFeatForNucProtSet (entry);
12929     seh = scope.AddTopLevelSeqEntry(*entry);
12930     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ProductLength",
12931                               "Protein product length [30] is more than 120% of the translation length [9]"));
12932     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "TransLen",
12933         "Given protein length [30] does not match translation length [9]"));
12934     //AddChromosomeNoLocation(expected_errors, entry);
12935 
12936     eval = validator.Validate(seh, options);
12937     CheckErrors (*eval, expected_errors);
12938 
12939     CLEAR_ERRORS
12940 
12941     // setting this exception suppresses the error
12942     cds->SetExcept(true);
12943     cds->SetExcept_text("annotated by transcript or proteomic data");
12944     // inference is required for exception
12945     cds->AddQualifier("inference", "similar to DNA sequence:INSD:AY123456.1");
12946     //AddChromosomeNoLocation(expected_errors, entry);
12947     eval = validator.Validate(seh, options);
12948     CheckErrors(*eval, expected_errors);
12949 
12950     CLEAR_ERRORS
12951 }
12952 
12953 
BOOST_AUTO_TEST_CASE(Test_FEAT_NoStop)
{
    // Move the CDS end to 23 and expect the missing-stop-codon error.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetLocation().SetInt().SetTo(23);

    STANDARD_SETUP

    // The stand-alone helper is checked before running the validator.
    BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "NoStop",
                           "Missing stop codon"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
12973 
12974 
BOOST_AUTO_TEST_CASE(Test_FEAT_TranslExcept)
{
    // --- Case 1: transl_except qualifier with value "abc" plus a
    //     protein whose residue at index 4 is changed to 'E'.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds_feat->AddQualifier("transl_except", "abc");
    CRef<CSeq_entry> protein = entry->SetSet().SetSeq_set().back();
    protein->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[4] = 'E';

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "MisMatchAA",
                           "Residue 5 in protein [E] != translation [T] at lcl|nuc:13-15"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExcept",
                           "Unparsed transl_except qual. Skipped"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // --- Case 2: same qualifier on an otherwise unmodified set.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();
    cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds_feat->AddQualifier("transl_except", "abc");
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExcept",
                           "Unparsed transl_except qual (but protein is okay). Skipped"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13011 
13012 
BOOST_AUTO_TEST_CASE(Test_FEAT_NoProtRefFound)
{
    // --- Case 1: the first feature on the protein is shortened so that
    //     it ends at 6.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> protein = entry->SetSet().SetSeq_set().back();
    CRef<CSeq_feat> protein_feat =
        protein->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
    protein_feat->SetLocation().SetInt().SetTo(6);

    STANDARD_SETUP

    // see this error if prot-ref present, but wrong size, or if absent completely
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Error, "MissingProteinName",
                           "The product name is missing from this protein."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // --- Case 2: all annotation removed from the protein; the same
    //     expectation list is checked again.
    scope.RemoveTopLevelSeqEntry(seh);
    protein->SetSeq().ResetAnnot();
    seh = scope.AddTopLevelSeqEntry(*entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13039 
13040 
BOOST_AUTO_TEST_CASE(Test_FEAT_OrfCdsHasProduct)
{
    // Flag the coding region as an ORF; the CDS from the good nuc-prot
    // set is otherwise left untouched.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds_feat->SetData().SetCdregion().SetOrf(true);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "OrfCdsHasProduct",
                           "An ORF coding region should not have a product"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13058 
13059 
BOOST_AUTO_TEST_CASE(Test_FEAT_GeneRefHasNoData)
{
    // Add a feature to the nucleotide sequence, switch its data to a
    // gene without setting any gene fields, and end it at 26.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nucleotide = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> empty_gene = unit_test_util::AddMiscFeature(nucleotide);
    empty_gene->SetData().SetGene();
    empty_gene->SetLocation().SetInt().SetTo(26);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "GeneRefHasNoData",
                           "There is a gene feature where all fields are empty"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13079 
13080 
BOOST_AUTO_TEST_CASE(Test_FEAT_ExceptInconsistent)
{
    const string kExceptText = "trans-splicing";

    // --- Case 1: exception given only as a qualifier.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds_feat->AddQualifier("exception", kExceptText);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "MissingExceptionFlag",
                           "Exception flag should be set in coding region"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // --- Case 2: qualifiers cleared; exception text set without the flag.
    cds_feat->ResetQual();
    cds_feat->SetExcept_text(kExceptText);
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "MissingExceptionFlag",
                           "Exception text is present, but exception flag is not set"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // --- Case 3: flag set, text cleared.
    cds_feat->ResetExcept_text();
    cds_feat->SetExcept(true);

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "ExceptionMissingText",
                           "Exception flag is set, but exception text is empty"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13121 
13122 
// Resets the protein reference of the first feature on the last member of a
// good nuc-prot set and expects both ProtRefHasNoData and NoNameForProtein
// warnings on the protein sequence.
BOOST_AUTO_TEST_CASE(Test_FEAT_ProtRefHasNoData)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Walk down to the first feature of the first annotation on the last
    // entry of the set, one step at a time.
    CRef<CSeq_entry> last_member = entry->SetSet().SetSeq_set().back();
    CRef<CSeq_annot> first_annot = last_member->SetSeq().SetAnnot().front();
    CRef<CSeq_feat> protein_feature = first_annot->SetData().SetFtable().front();
    protein_feature->SetData().SetProt().Reset();

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "ProtRefHasNoData",
                           "There is a protein feature where all fields are empty"));
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "NoNameForProtein",
                           "Protein feature has no name"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13142 
13143 
// Checks GenCodeMismatch reporting for a coding region whose genetic code
// (3) disagrees with the BioSource:
//   - with an apicoplast genome the message names the genome context,
//   - with an unknown genome the message names the BioSource code (2),
//   - with the "genetic code exception" exception text no error is expected.
BOOST_AUTO_TEST_CASE(Test_FEAT_GenCodeMismatch)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Put genetic code id 3 on the coding region.
    CRef< CGenetic_code::C_E > ce(new CGenetic_code::C_E);
    ce->SetId(3);
    cds->SetData().SetCdregion().SetCode().Set().push_back(ce);

    // BioSource: apicoplast genome, genetic code 2.
    unit_test_util::SetGenome (entry, CBioSource::eGenome_apicoplast);
    unit_test_util::SetGcode (entry, 2);

    // NOTE(review): residue 6 of the protein is changed to 'M', presumably so
    // the product still matches the translation under code 3 -- confirm.
    CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
    prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[6] = 'M';

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "GenCodeMismatch",
                              "Genetic code conflict between CDS (code 3) and BioSource.genome biological context (apicoplast) (uses code 11)"));

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Without a specific genome the conflict is reported against the
    // BioSource genetic code instead.
    unit_test_util::SetGenome (entry, CBioSource::eGenome_unknown);

    expected_errors[0]->SetErrMsg("Genetic code conflict between CDS (code 3) and BioSource (code 2)");

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS

    // ignore gencode mismatch for specified exception text
    cds->SetExcept(true);
    cds->SetExcept_text("genetic code exception");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
13183 
13184 
// Turns a misc feature into an RNA feature of type "unknown" and expects the
// RNAtype0 warning.
BOOST_AUTO_TEST_CASE(Test_FEAT_RNAtype0)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> rna_feat = unit_test_util::AddMiscFeature(entry);
    rna_feat->SetData().SetRna().SetType(CRNA_ref::eType_unknown);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "RNAtype0",
                           "RNA type 0 (unknown) not supported"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13201 
13202 
// Checks UnknownImpFeatKey reporting for import features:
//   - an unrecognized key and an empty key are expected as warnings,
//   - each key in the retired list below is expected as an error.
BOOST_AUTO_TEST_CASE(Test_FEAT_UnknownImpFeatKey)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> imp_feat = unit_test_util::AddMiscFeature(entry);
    imp_feat->SetData().SetImp().SetKey("bad value");

    STANDARD_SETUP

    // Unrecognized key.
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "UnknownImpFeatKey",
                           "Unknown feature key bad value"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Empty key; the entry is taken out of the scope while it is edited.
    scope.RemoveTopLevelSeqEntry(seh);
    imp_feat->SetData().SetImp().SetKey("");
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors[0]->SetErrMsg("NULL feature key");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Keys that are expected to be reported as no longer legal.
    static const char* const kRetiredKeys[] = {
        "virion",
        "mutation",
        "allele",
        "Import"
    };
    const size_t kNumRetiredKeys = sizeof(kRetiredKeys) / sizeof(kRetiredKeys[0]);

    expected_errors[0]->SetSeverity(eDiag_Error);
    for (size_t idx = 0; idx < kNumRetiredKeys; ++idx) {
        const string retired_key(kRetiredKeys[idx]);

        scope.RemoveTopLevelSeqEntry(seh);
        imp_feat->SetData().SetImp().SetKey(retired_key);
        seh = scope.AddTopLevelSeqEntry(*entry);

        expected_errors[0]->SetErrMsg("Feature key " + retired_key + " is no longer legal");
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS
}
13242 
13243 
// Checks UnknownImpFeatQual reporting: first for a qualifier with an
// unrecognized name, then for a qualifier whose name is empty.
BOOST_AUTO_TEST_CASE(Test_FEAT_UnknownImpFeatQual)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> imp_feat = unit_test_util::AddMiscFeature(entry);
    imp_feat->AddQualifier("bad name", "some value");

    STANDARD_SETUP

    // Unrecognized qualifier name.
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "UnknownImpFeatQual",
                           "Unknown qualifier bad name"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Same qualifier with its name blanked out.
    imp_feat->SetQual().front()->SetQual("");
    expected_errors[0]->SetErrMsg("NULL qualifier");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13265 
13266 
13267 // begin automatically generated section
BOOST_AUTO_TEST_CASE(Test_FEAT_MissingQualOnImpFeat)13268 BOOST_AUTO_TEST_CASE(Test_FEAT_MissingQualOnImpFeat)
13269 {
13270 
13271     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
13272     CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature (entry);
13273 
13274     STANDARD_SETUP
13275 
13276     scope.RemoveTopLevelSeqEntry (seh);
13277     entry = unit_test_util::BuildGoodSeq();
13278     misc_feat = unit_test_util::AddMiscFeature (entry);
13279     misc_feat->SetData().SetImp().SetKey("conflict");
13280     seh = scope.AddTopLevelSeqEntry (*entry);
13281     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13282                               "Missing qualifier citation for feature conflict"));
13283     //AddChromosomeNoLocation(expected_errors, entry);
13284     eval = validator.Validate(seh, options);
13285     CheckErrors (*eval, expected_errors);
13286     CLEAR_ERRORS
13287     scope.RemoveTopLevelSeqEntry (seh);
13288     entry = unit_test_util::BuildGoodSeq();
13289     misc_feat = unit_test_util::AddMiscFeature (entry);
13290     misc_feat->SetData().SetImp().SetKey("misc_binding");
13291     seh = scope.AddTopLevelSeqEntry (*entry);
13292     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13293                               "Missing qualifier bound_moiety for feature misc_binding"));
13294     //AddChromosomeNoLocation(expected_errors, entry);
13295     eval = validator.Validate(seh, options);
13296     CheckErrors (*eval, expected_errors);
13297     CLEAR_ERRORS
13298     scope.RemoveTopLevelSeqEntry (seh);
13299     entry = unit_test_util::BuildGoodSeq();
13300     misc_feat = unit_test_util::AddMiscFeature (entry);
13301     misc_feat->SetData().SetImp().SetKey("modified_base");
13302     seh = scope.AddTopLevelSeqEntry (*entry);
13303     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13304                               "Missing qualifier mod_base for feature modified_base"));
13305     //AddChromosomeNoLocation(expected_errors, entry);
13306     eval = validator.Validate(seh, options);
13307     CheckErrors (*eval, expected_errors);
13308     CLEAR_ERRORS
13309     scope.RemoveTopLevelSeqEntry (seh);
13310     entry = unit_test_util::BuildGoodSeq();
13311     misc_feat = unit_test_util::AddMiscFeature (entry);
13312     misc_feat->SetData().SetImp().SetKey("old_sequence");
13313     seh = scope.AddTopLevelSeqEntry (*entry);
13314     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13315                               "Missing qualifier citation for feature old_sequence"));
13316     //AddChromosomeNoLocation(expected_errors, entry);
13317     eval = validator.Validate(seh, options);
13318     CheckErrors (*eval, expected_errors);
13319     CLEAR_ERRORS
13320     scope.RemoveTopLevelSeqEntry (seh);
13321     entry = unit_test_util::BuildGoodSeq();
13322     misc_feat = unit_test_util::AddMiscFeature (entry);
13323     misc_feat->SetData().SetImp().SetKey("operon");
13324     seh = scope.AddTopLevelSeqEntry (*entry);
13325     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13326                               "Missing qualifier operon for feature operon"));
13327     //AddChromosomeNoLocation(expected_errors, entry);
13328     eval = validator.Validate(seh, options);
13329     CheckErrors (*eval, expected_errors);
13330     CLEAR_ERRORS
13331     scope.RemoveTopLevelSeqEntry (seh);
13332     entry = unit_test_util::BuildGoodSeq();
13333     misc_feat = unit_test_util::AddMiscFeature (entry);
13334     misc_feat->SetData().SetImp().SetKey("protein_bind");
13335     seh = scope.AddTopLevelSeqEntry (*entry);
13336     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13337                               "Missing qualifier bound_moiety for feature protein_bind"));
13338     //AddChromosomeNoLocation(expected_errors, entry);
13339     eval = validator.Validate(seh, options);
13340     CheckErrors (*eval, expected_errors);
13341     CLEAR_ERRORS
13342     scope.RemoveTopLevelSeqEntry (seh);
13343     entry = unit_test_util::BuildGoodSeq();
13344     misc_feat = unit_test_util::AddMiscFeature (entry);
13345     misc_feat->SetData().SetImp().SetKey("source");
13346     seh = scope.AddTopLevelSeqEntry (*entry);
13347     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingQualOnImpFeat",
13348                               "Missing qualifier organism for feature source"));
13349     //AddChromosomeNoLocation(expected_errors, entry);
13350     eval = validator.Validate(seh, options);
13351     CheckErrors (*eval, expected_errors);
13352     CLEAR_ERRORS
13353 }
13354 //end automatically generated section
13355 
13356 
// Expects PseudoCdsHasProduct for a pseudo coding region in a nuc-prot set in
// three configurations: with a pseudo gene, with the same gene no longer
// marked pseudo, and with no gene at all.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoCdsHasProduct)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> pseudo_cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    pseudo_cds->SetPseudo(true);

    // Add a pseudo gene made for the coding region to the nucleotide sequence.
    CRef<CSeq_entry> nuc_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> parent_gene = unit_test_util::MakeGeneForFeature(pseudo_cds);
    parent_gene->SetPseudo(true);
    unit_test_util::AddFeat(parent_gene, nuc_seq);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "PseudoCdsHasProduct",
                           "A pseudo coding region should not have a product"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Same expectation when only the coding region is pseudo.
    parent_gene->SetPseudo(false);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Same expectation on a fresh set that has no gene feature added.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();
    pseudo_cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    pseudo_cds->SetPseudo(true);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13388 
13389 
// Returns a copy of str with the case of every letter inverted: upper-case
// characters become lower-case and vice versa; all other characters are
// copied unchanged. Used to produce "wrong capitalization" variants of
// db_xref database names.
//
// Characters are converted to unsigned char before being handed to the
// <ctype.h> classification/conversion functions, because passing a negative
// char value to them is undefined behavior.
static std::string MakeWrongCap (const std::string& str)
{
    std::string bad;
    bad.reserve(str.size());

    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        const unsigned char ch = static_cast<unsigned char>(*it);
        if (isupper(ch)) {
            bad.push_back(static_cast<char>(tolower(ch)));
        } else if (islower(ch)) {
            bad.push_back(static_cast<char>(toupper(ch)));
        } else {
            // Not a letter: keep as is (this also preserves embedded NULs).
            bad.push_back(*it);
        }
    }
    return bad;
}
13407 
13408 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_IllegalDbXref)13409 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_IllegalDbXref)
13410 {
13411     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
13412 
13413     STANDARD_SETUP
13414 
13415     vector<string> legal_strings;
13416     legal_strings.push_back ("AceView/WormGenes");
13417     legal_strings.push_back ("AFTOL");
13418     legal_strings.push_back ("AntWeb");
13419     legal_strings.push_back ("APHIDBASE");
13420     legal_strings.push_back ("ApiDB");
13421     legal_strings.push_back ("ApiDB_CryptoDB");
13422     legal_strings.push_back ("ApiDB_PlasmoDB");
13423     legal_strings.push_back ("ApiDB_ToxoDB");
13424     legal_strings.push_back ("ASAP");
13425     legal_strings.push_back ("ATCC");
13426     legal_strings.push_back ("ATCC(in host)");
13427     legal_strings.push_back ("ATCC(dna)");
13428     legal_strings.push_back ("Axeldb");
13429     legal_strings.push_back ("BDGP_EST");
13430     legal_strings.push_back ("BDGP_INS");
13431     legal_strings.push_back ("BEETLEBASE");
13432     legal_strings.push_back ("BOLD");
13433     legal_strings.push_back ("CDD");
13434     legal_strings.push_back ("CK");
13435     legal_strings.push_back ("COG");
13436     legal_strings.push_back ("dbClone");
13437     legal_strings.push_back ("dbCloneLib");
13438     legal_strings.push_back ("dbEST");
13439     legal_strings.push_back ("dbProbe");
13440     legal_strings.push_back ("dbSNP");
13441     legal_strings.push_back ("dbSTS");
13442     legal_strings.push_back ("dictyBase");
13443     legal_strings.push_back ("DDBJ");
13444     legal_strings.push_back ("EcoGene");
13445     legal_strings.push_back ("EMBL");
13446     // legal_strings.push_back ("ENSEMBL");
13447     legal_strings.push_back ("Ensembl");
13448     legal_strings.push_back ("ESTLIB");
13449     legal_strings.push_back ("FANTOM_DB");
13450     legal_strings.push_back ("FLYBASE");
13451     legal_strings.push_back ("GABI");
13452     legal_strings.push_back ("GDB");
13453     legal_strings.push_back ("GeneDB");
13454     legal_strings.push_back ("GeneID");
13455     legal_strings.push_back ("GO");
13456     legal_strings.push_back ("GOA");
13457     legal_strings.push_back ("Greengenes");
13458     legal_strings.push_back ("GRIN");
13459     legal_strings.push_back ("H-InvDB");
13460     legal_strings.push_back ("HGNC");
13461     legal_strings.push_back ("HMP");
13462     legal_strings.push_back ("HOMD");
13463     legal_strings.push_back ("HSSP");
13464     legal_strings.push_back ("IMGT/GENE-DB");
13465     legal_strings.push_back ("IMGT/HLA");
13466     legal_strings.push_back ("IMGT/LIGM");
13467     legal_strings.push_back ("InterimID");
13468     legal_strings.push_back ("InterPro");
13469     legal_strings.push_back ("IRD");
13470     legal_strings.push_back ("ISD");
13471     legal_strings.push_back ("ISFinder");
13472     legal_strings.push_back ("JCM");
13473     legal_strings.push_back ("JGIDB");
13474     legal_strings.push_back ("LocusID");
13475     legal_strings.push_back ("MaizeGDB");
13476     legal_strings.push_back ("MGI");
13477     legal_strings.push_back ("MIM");
13478     legal_strings.push_back ("miRBase");
13479     legal_strings.push_back ("MycoBank");
13480     legal_strings.push_back ("NBRC");
13481     legal_strings.push_back ("NextDB");
13482     legal_strings.push_back ("niaEST");
13483     legal_strings.push_back ("NMPDR");
13484     legal_strings.push_back ("NRESTdb");
13485     legal_strings.push_back ("Osa1");
13486     legal_strings.push_back ("Pathema");
13487     legal_strings.push_back ("PBmice");
13488     legal_strings.push_back ("PDB");
13489     legal_strings.push_back ("PFAM");
13490     legal_strings.push_back ("PGN");
13491     legal_strings.push_back ("PIR");
13492     legal_strings.push_back ("PSEUDO");
13493     legal_strings.push_back ("PseudoCap");
13494     legal_strings.push_back ("RAP-DB");
13495     legal_strings.push_back ("RATMAP");
13496     legal_strings.push_back ("RFAM");
13497     legal_strings.push_back ("RGD");
13498     legal_strings.push_back ("RiceGenes");
13499     legal_strings.push_back ("RZPD");
13500     legal_strings.push_back ("SEED");
13501     legal_strings.push_back ("SGD");
13502     legal_strings.push_back ("SGN");
13503     legal_strings.push_back ("SoyBase");
13504     legal_strings.push_back ("SubtiList");
13505     legal_strings.push_back ("TAIR");
13506     legal_strings.push_back ("taxon");
13507     legal_strings.push_back ("TIGRFAM");
13508     legal_strings.push_back ("UniGene");
13509     legal_strings.push_back ("UNILIB");
13510     legal_strings.push_back ("UniProtKB/Swiss-Prot");
13511     legal_strings.push_back ("UniProtKB/TrEMBL");
13512     legal_strings.push_back ("UniSTS");
13513     legal_strings.push_back ("UNITE");
13514     legal_strings.push_back ("VBASE2");
13515     legal_strings.push_back ("VectorBase");
13516     legal_strings.push_back ("WorfDB");
13517     legal_strings.push_back ("WormBase");
13518     legal_strings.push_back ("Xenbase");
13519     legal_strings.push_back ("ZFIN");
13520     vector<string> src_strings;
13521     src_strings.push_back ("AFTOL");
13522     src_strings.push_back ("AntWeb");
13523     src_strings.push_back ("ATCC");
13524     src_strings.push_back ("ATCC(dna)");
13525     src_strings.push_back ("ATCC(in host)");
13526     src_strings.push_back ("BOLD");
13527     src_strings.push_back ("FANTOM_DB");
13528     src_strings.push_back ("FLYBASE");
13529     src_strings.push_back ("Greengenes");
13530     src_strings.push_back ("GRIN");
13531     src_strings.push_back ("HMP");
13532     src_strings.push_back ("HOMD");
13533     src_strings.push_back ("IMGT/HLA");
13534     src_strings.push_back ("IMGT/LIGM");
13535     src_strings.push_back ("JCM");
13536     src_strings.push_back ("MGI");
13537     src_strings.push_back ("MycoBank");
13538     src_strings.push_back ("NBRC");
13539     src_strings.push_back ("RZPD");
13540     src_strings.push_back ("taxon");
13541     src_strings.push_back ("UNILIB");
13542     src_strings.push_back ("UNITE");
13543     vector<string> refseq_strings;
13544     refseq_strings.push_back ("CCDS");
13545     refseq_strings.push_back ("CGNC");
13546     refseq_strings.push_back ("CloneID");
13547     refseq_strings.push_back ("HPRD");
13548     refseq_strings.push_back ("LRG");
13549     refseq_strings.push_back ("PBR");
13550     refseq_strings.push_back ("REBASE");
13551     refseq_strings.push_back ("SK-FST");
13552     refseq_strings.push_back ("VBRC");
13553 
13554     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "IllegalDbXref",
13555                                          "db_xref type %s (1234) should not be used on an OrgRef"));
13556     //AddChromosomeNoLocation(expected_errors, entry);
13557 
13558     string bad;
13559     ITERATE (vector<string>, sit, src_strings) {
13560         if (NStr::Equal(*sit, "taxon")) {
13561             unit_test_util::RemoveDbxref (entry, *sit, 0);
13562         }
13563         bad = MakeWrongCap(*sit);
13564         unit_test_util::SetDbxref (entry, bad, 1234);
13565         expected_errors[0]->SetErrMsg("Illegal db_xref type " + bad + " (1234), legal capitalization is " + *sit);
13566         eval = validator.Validate(seh, options);
13567         CheckErrors (*eval, expected_errors);
13568         unit_test_util::RemoveDbxref (entry, bad, 0);
13569         if (NStr::Equal(*sit, "taxon")) {
13570             unit_test_util::SetTaxon(entry, 592768);
13571         }
13572     }
13573 
13574     ITERATE (vector<string>, sit, legal_strings) {
13575         bool found = false;
13576         ITERATE (vector<string>, ss, src_strings) {
13577             if (NStr::Equal(*ss, *sit)) {
13578                 found = true;
13579                 break;
13580             }
13581         }
13582         if (found) {
13583             continue;
13584         }
13585         bad = MakeWrongCap(*sit);
13586         unit_test_util::SetDbxref (entry, bad, 1234);
13587         expected_errors[0]->SetErrMsg("Illegal db_xref type " + bad + " (1234), legal capitalization is " + *sit
13588                                       + ", but should not be used on an OrgRef");
13589         eval = validator.Validate(seh, options);
13590         CheckErrors (*eval, expected_errors);
13591         unit_test_util::RemoveDbxref (entry, bad, 0);
13592 
13593         unit_test_util::SetDbxref (entry, *sit, 1234);
13594         expected_errors[0]->SetErrMsg("db_xref type " + *sit + " (1234) should not be used on an OrgRef");
13595         eval = validator.Validate(seh, options);
13596         CheckErrors (*eval, expected_errors);
13597         unit_test_util::RemoveDbxref (entry, *sit, 0);
13598     }
13599 
13600     ITERATE (vector<string>, sit, refseq_strings) {
13601         unit_test_util::SetDbxref (entry, *sit, 1234);
13602         expected_errors[0]->SetErrMsg("RefSeq-specific db_xref type " + *sit + " (1234) should not be used on a non-RefSeq OrgRef");
13603         eval = validator.Validate(seh, options);
13604         CheckErrors (*eval, expected_errors);
13605         unit_test_util::RemoveDbxref (entry, *sit, 0);
13606     }
13607 
13608     unit_test_util::SetDbxref (entry, "unrecognized", 1234);
13609     expected_errors[0]->SetErrMsg("Illegal db_xref type unrecognized (1234)");
13610     eval = validator.Validate(seh, options);
13611     CheckErrors (*eval, expected_errors);
13612     unit_test_util::RemoveDbxref (entry, "unrecognized", 0);
13613 
13614     scope.RemoveTopLevelSeqEntry(seh);
13615     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
13616     seh = scope.AddTopLevelSeqEntry(*entry);
13617     ChangeErrorAcc(expected_errors, "ref|NC_123456|");
13618     ITERATE (vector<string>, sit, refseq_strings) {
13619         unit_test_util::SetDbxref (entry, *sit, 1234);
13620         expected_errors[0]->SetErrMsg("RefSeq-specific db_xref type " + *sit + " (1234) should not be used on an OrgRef");
13621         eval = validator.Validate(seh, options);
13622         CheckErrors (*eval, expected_errors);
13623         unit_test_util::RemoveDbxref (entry, *sit, 0);
13624     }
13625 
13626     scope.RemoveTopLevelSeqEntry(seh);
13627     entry->SetSeq().SetId().front()->SetLocal().SetStr("good");
13628     CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
13629     seh = scope.AddTopLevelSeqEntry(*entry);
13630     ChangeErrorAcc(expected_errors, "lcl|good");
13631 
13632     ITERATE (vector<string>, sit, legal_strings) {
13633         bad = MakeWrongCap(*sit);
13634         unit_test_util::SetDbxref (feat, bad, 1234);
13635         if (NStr::Equal(*sit, "taxon")) {
13636             expected_errors[0]->SetErrMsg("Illegal db_xref type TAXON (1234), legal capitalization is taxon, but should only be used on an OrgRef");
13637         } else {
13638             expected_errors[0]->SetErrMsg("Illegal db_xref type " + bad + " (1234), legal capitalization is " + *sit);
13639         }
13640         eval = validator.Validate(seh, options);
13641         CheckErrors (*eval, expected_errors);
13642         unit_test_util::RemoveDbxref (feat, bad, 0);
13643     }
13644 
13645     ITERATE (vector<string>, sit, refseq_strings) {
13646         unit_test_util::SetDbxref (feat, *sit, 1234);
13647         expected_errors[0]->SetErrMsg("db_xref type " + *sit + " (1234) is only legal for RefSeq");
13648         eval = validator.Validate(seh, options);
13649         CheckErrors (*eval, expected_errors);
13650         unit_test_util::RemoveDbxref (feat, *sit, 0);
13651     }
13652 
13653     unit_test_util::SetDbxref(feat, "taxon", 1234);
13654     expected_errors[0]->SetErrMsg("db_xref type taxon (1234) should only be used on an OrgRef");
13655     eval = validator.Validate(seh, options);
13656     CheckErrors (*eval, expected_errors);
13657     unit_test_util::RemoveDbxref (feat, "taxon", 0);
13658 
13659     unit_test_util::SetDbxref (feat, "unrecognized", 1234);
13660     expected_errors[0]->SetErrMsg("Illegal db_xref type unrecognized (1234)");
13661     eval = validator.Validate(seh, options);
13662     CheckErrors (*eval, expected_errors);
13663     unit_test_util::RemoveDbxref (feat, "unrecognized", 0);
13664 
13665     CLEAR_ERRORS
13666 }
13667 
13668 
// Gives a misc feature a mix location whose last interval points at GenBank
// accession AY123456 instead of the local sequence, and expects both
// FarLocation (warning) and BadLocation (error).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FarLocation)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> far_feat = unit_test_util::AddMiscFeature(entry);

    // Start from a mix location built for the sequence's own id, then
    // redirect its last interval to a different accession.
    CSeq_loc& feat_loc = far_feat->SetLocation();
    feat_loc.Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
    feat_loc.SetMix().Set().back()->SetInt().SetId().SetGenbank().SetAccession("AY123456");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "FarLocation",
                           "Feature has 'far' location - accession not packaged in record"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "BadLocation",
                           "Feature location intervals should all be on the same sequence"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
13686 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateFeat)13687 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateFeat)
13688 {
13689     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
13690     CRef<CSeq_feat> feat1 = unit_test_util::AddMiscFeature (entry);
13691     CRef<CSeq_feat> feat2 = unit_test_util::AddMiscFeature (entry);
13692     feat2->SetComment("a");
13693 
13694     STANDARD_SETUP
13695 
13696     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "DuplicateFeat", "Features have identical intervals, but labels differ"));
13697     //AddChromosomeNoLocation(expected_errors, entry);
13698     eval = validator.Validate(seh, options);
13699     CheckErrors (*eval, expected_errors);
13700 
13701     // error if genbank accession
13702     scope.RemoveTopLevelSeqEntry(seh);
13703     entry = unit_test_util::BuildGoodSeq();
13704     entry->SetSeq().SetId().front()->SetGenbank().SetAccession("AY123456");
13705     feat1 = unit_test_util::AddMiscFeature (entry);
13706     feat1->SetData().SetGene().SetLocus("locus1");
13707     feat2 = unit_test_util::AddMiscFeature (entry);
13708     feat2->SetData().SetGene().SetLocus("locus2");
13709     seh = scope.AddTopLevelSeqEntry(*entry);
13710     ChangeErrorAcc(expected_errors, "gb|AY123456|");
13711     eval = validator.Validate(seh, options);
13712     CheckErrors (*eval, expected_errors);
13713 
13714     unit_test_util::SetDrosophila_melanogaster (entry);
13715     expected_errors[0]->SetSeverity(eDiag_Warning);
13716     eval = validator.Validate(seh, options);
13717     CheckErrors (*eval, expected_errors);
13718 
13719     // warning if genes are partial
13720     unit_test_util::SetSebaea_microphylla(entry);
13721     feat1->SetPartial(true);
13722     feat1->SetLocation().SetPartialStart(true, eExtreme_Biological);
13723     feat2->SetPartial(true);
13724     feat2->SetLocation().SetPartialStart(true, eExtreme_Biological);
13725     expected_errors[0]->SetSeverity(eDiag_Warning);
13726     eval = validator.Validate(seh, options);
13727     CheckErrors (*eval, expected_errors);
13728 
13729     // warning if genes are pseudo
13730     feat1->SetPartial(false);
13731     feat1->SetLocation().SetPartialStart(false, eExtreme_Biological);
13732     feat2->SetPartial(false);
13733     feat2->SetLocation().SetPartialStart(false, eExtreme_Biological);
13734     feat1->SetPseudo(true);
13735     feat2->SetPseudo(true);
13736     expected_errors[0]->SetSeverity(eDiag_Warning);
13737     eval = validator.Validate(seh, options);
13738     CheckErrors (*eval, expected_errors);
13739 
13740     // error if general ID
13741     scope.RemoveTopLevelSeqEntry(seh);
13742     entry = unit_test_util::BuildGoodSeq();
13743     entry->SetSeq().SetId().front()->SetGeneral().SetDb("abc");
13744     entry->SetSeq().SetId().front()->SetGeneral().SetTag().SetId(123456);
13745     feat1 = unit_test_util::AddMiscFeature (entry);
13746     feat1->SetData().SetGene().SetLocus("locus1");
13747     feat2 = unit_test_util::AddMiscFeature (entry);
13748     feat2->SetData().SetGene().SetLocus("locus2");
13749     seh = scope.AddTopLevelSeqEntry(*entry);
13750     ChangeErrorAcc(expected_errors, "gnl|abc|123456");
13751     eval = validator.Validate(seh, options);
13752     CheckErrors (*eval, expected_errors);
13753 
13754     unit_test_util::SetDrosophila_melanogaster (entry);
13755     expected_errors[0]->SetSeverity(eDiag_Warning);
13756     eval = validator.Validate(seh, options);
13757     CheckErrors (*eval, expected_errors);
13758 
13759     // warning if genes are partial
13760     unit_test_util::SetSebaea_microphylla(entry);
13761     feat1->SetPartial(true);
13762     feat1->SetLocation().SetPartialStart(true, eExtreme_Biological);
13763     feat2->SetPartial(true);
13764     feat2->SetLocation().SetPartialStart(true, eExtreme_Biological);
13765     expected_errors[0]->SetSeverity(eDiag_Warning);
13766     eval = validator.Validate(seh, options);
13767     CheckErrors (*eval, expected_errors);
13768 
13769     // warning if genes are pseudo
13770     feat1->SetPartial(false);
13771     feat1->SetLocation().SetPartialStart(false, eExtreme_Biological);
13772     feat2->SetPartial(false);
13773     feat2->SetLocation().SetPartialStart(false, eExtreme_Biological);
13774     feat1->SetPseudo(true);
13775     feat2->SetPseudo(true);
13776     expected_errors[0]->SetSeverity(eDiag_Warning);
13777     eval = validator.Validate(seh, options);
13778     CheckErrors (*eval, expected_errors);
13779 
13780 
13781     // always warning if on different annots
13782     scope.RemoveTopLevelSeqEntry(seh);
13783     entry = unit_test_util::BuildGoodSeq();
13784     feat1 = unit_test_util::AddMiscFeature (entry);
13785     CRef<CSeq_annot> annot2(new CSeq_annot());
13786     feat2->Assign(*feat1);
13787     feat2->SetComment("a");
13788     annot2->SetData().SetFtable().push_back(feat2);
13789     entry->SetSeq().SetAnnot().push_back(annot2);
13790     seh = scope.AddTopLevelSeqEntry(*entry);
13791     ChangeErrorAcc(expected_errors, "lcl|good");
13792     expected_errors[0]->SetSeverity(eDiag_Warning);
13793     expected_errors[0]->SetErrMsg("Features have identical intervals, but labels differ (packaged in different feature table)");
13794     eval = validator.Validate(seh, options);
13795     CheckErrors (*eval, expected_errors);
13796 
13797     CLEAR_ERRORS
13798 }
13799 
13800 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryGeneXref)
{
    // Exercises the UnnecessaryGeneXref check in three stages:
    //   1. a feature whose gene xref names the gene built with the same
    //      AddMiscFeature arguments            -> nothing expected,
    //   2. a second (partial) gene is added    -> nothing expected,
    //   3. that second gene gets a gene xref   -> UnnecessaryGeneXref warning.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> xref_feat = unit_test_util::AddMiscFeature(entry, 15);
    CRef<CSeq_feat> foo_gene = unit_test_util::AddMiscFeature(entry, 15);
    foo_gene->SetData().SetGene().SetLocus("foo");
    xref_feat->SetGeneXref().SetLocus("foo");

    STANDARD_SETUP

    // Re-runs the validator on the current handle and compares with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Stage 1: no expected errors have been registered.
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    // Stage 2: now gene xref is necessary - add a partial gene "bar" built
    // with the same AddMiscFeature arguments.
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_feat> bar_gene = unit_test_util::AddMiscFeature(entry, 15);
    bar_gene->SetLocation().SetPartialStart(true, eExtreme_Biological);
    bar_gene->SetPartial(true);
    bar_gene->SetData().SetGene().SetLocus("bar");
    seh = scope.AddTopLevelSeqEntry(*entry);
    CLEAR_ERRORS
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    // Stage 3: a gene feature carrying its own gene xref is reported as a warning.
    bar_gene->SetGeneXref().SetLocus("bar");
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryGeneXref",
                           "Gene feature has gene cross-reference"));
    validate_and_check();

    CLEAR_ERRORS
}
13836 
13837 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranslExceptPhase)13838 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranslExceptPhase)
13839 {
13840     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
13841     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
13842     CRef<CSeq_entry> nuc = GetNucleotideSequenceFromGoodNucProtSet(entry);
13843     CRef<CSeq_feat> gene = MakeGeneForFeature(cds);
13844     gene->SetData().SetGene().SetLocus_tag("xyz");
13845     AddFeat(gene, nuc);
13846 
13847     CRef<CCode_break> codebreak(new CCode_break());
13848     codebreak->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
13849     codebreak->SetLoc().SetInt().SetFrom(4);
13850     codebreak->SetLoc().SetInt().SetTo(6);
13851     cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);
13852 
13853     STANDARD_SETUP
13854 
13855     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "CDSrange",
13856                                                  "Code-break location not in coding region - may be frame problem"));
13857     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExceptPhase",
13858                               "transl_except qual out of frame."));
13859     //AddChromosomeNoLocation(expected_errors, entry);
13860     eval = validator.Validate(seh, options);
13861     CheckErrors (*eval, expected_errors);
13862 
13863 
13864     CValidErrorFormat format(*objmgr);
13865     vector<string> expected;
13866     expected.push_back("TranslExceptPhase");
13867     expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13868     expected.push_back("");
13869     expected.push_back("CDSrange");
13870     expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13871     expected.push_back("");
13872     vector<string> seen;
13873     vector<string> cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
13874     ITERATE(vector<string>, it, cat_list) {
13875         vector<string> sublist;
13876         NStr::Split(*it, "\n", sublist, 0);
13877         ITERATE(vector<string>, sit, sublist) {
13878             seen.push_back(*sit);
13879         }
13880     }
13881 
13882     CheckStrings(seen, expected);
13883 
13884 
13885     // only see locus tags when requested
13886     for (auto it : eval->GetErrs()) {
13887         BOOST_CHECK_EQUAL(it->IsSetLocus_tag(), false);
13888     }
13889 
13890     eval = validator.Validate(seh, options | CValidator::eVal_collect_locus_tags);
13891     CheckErrors(*eval, expected_errors);
13892     for (auto it : eval->GetErrs()) {
13893         if (!NStr::Equal(it->GetErrCode(), "ChromosomeWithoutLocation")) {
13894             BOOST_CHECK_EQUAL(it->IsSetLocus_tag(), true);
13895             BOOST_CHECK_EQUAL(it->GetLocus_tag(), "xyz");
13896         }
13897     }
13898 
13899     expected.clear();
13900     expected.push_back("TranslExceptPhase");
13901     expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13902     expected.push_back("");
13903     expected.push_back("CDSrange");
13904     expected.push_back("lcl|nuc:CDS\t fake protein name\tlcl|nuc:1-27\txyz");
13905     expected.push_back("");
13906 
13907     cat_list = format.FormatCompleteSubmitterReport(*eval, scope);
13908     seen.clear();
13909     ITERATE(vector<string>, it, cat_list) {
13910         vector<string> sublist;
13911         NStr::Split(*it, "\n", sublist, 0);
13912         ITERATE(vector<string>, sit, sublist) {
13913             seen.push_back(*sit);
13914         }
13915     }
13916 
13917     CheckStrings(seen, expected);
13918 
13919     CLEAR_ERRORS
13920 }
13921 
13922 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TrnaCodonWrong)
{
    // A tRNA feature with codon value 0 (shown as UUU in the expected messages)
    // is paired with amino acids that do not match it: 'A' is expected as an
    // error, 'U' and 'O' as warnings.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna_feat = unit_test_util::AddMiscFeature(entry);
    trna_feat->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
    trna_feat->SetData().SetRna().SetExt().SetTRNA().SetCodon().push_back(0);

    // Stores the IUPAC amino-acid letter on the tRNA extension.
    auto assign_amino_acid = [&trna_feat](char residue) {
        trna_feat->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa(residue);
    };
    assign_amino_acid('A');

    STANDARD_SETUP

    // Re-runs the validator and compares with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "TrnaCodonWrong",
                           "Codon recognized by tRNA (UUU) does not match amino acid (A/Ala) specified by genetic code (1/Standard)"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    // drop to warning if aa is 'U' or 'O'
    assign_amino_acid('U');
    expected_errors[0]->SetSeverity(eDiag_Warning);
    expected_errors[0]->SetErrMsg("Codon recognized by tRNA (UUU) does not match amino acid (U/Sec) specified by genetic code (1/Standard)");
    validate_and_check();

    assign_amino_acid('O');
    expected_errors[0]->SetErrMsg("Codon recognized by tRNA (UUU) does not match amino acid (O/Pyl) specified by genetic code (1/Standard)");
    validate_and_check();

    CLEAR_ERRORS
}
13953 
13954 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BothStrands)13955 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BothStrands)
13956 {
13957     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
13958     CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature (entry);
13959     feat->SetData().SetGene().SetLocus("X");
13960     feat->SetLocation().SetInt().SetStrand(eNa_strand_both);
13961 
13962     STANDARD_SETUP
13963 
13964     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BothStrands",
13965                       "gene may not be on both (forward) strands"));
13966     //AddChromosomeNoLocation(expected_errors, entry);
13967     eval = validator.Validate(seh, options);
13968     CheckErrors (*eval, expected_errors);
13969 
13970     scope.RemoveTopLevelSeqEntry(seh);
13971     entry = unit_test_util::BuildGoodSeq();
13972     feat = unit_test_util::AddMiscFeature (entry);
13973     feat->SetData().SetGene().SetLocus("X");
13974     feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
13975     feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both);
13976     feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
13977     // set trans-splicing exception to prevent mixed-strand error
13978     feat->SetExcept(true);
13979     feat->SetExcept_text("trans-splicing");
13980     seh = scope.AddTopLevelSeqEntry(*entry);
13981     expected_errors[0]->SetErrMsg("gene may not be on both (forward and reverse) strands");
13982     eval = validator.Validate(seh, options);
13983     CheckErrors (*eval, expected_errors);
13984 
13985     scope.RemoveTopLevelSeqEntry(seh);
13986     entry = unit_test_util::BuildGoodSeq();
13987     feat = unit_test_util::AddMiscFeature (entry);
13988     feat->SetData().SetGene().SetLocus("X");
13989     feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
13990     feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both_rev);
13991     feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
13992     seh = scope.AddTopLevelSeqEntry(*entry);
13993     expected_errors[0]->SetErrMsg("gene may not be on both (reverse) strands");
13994     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MultiIntervalGene",
13995         "Gene feature on non-segmented sequence should not have multiple intervals"));
13996     eval = validator.Validate(seh, options);
13997     CheckErrors (*eval, expected_errors);
13998 
13999     CLEAR_ERRORS
14000 
14001     scope.RemoveTopLevelSeqEntry(seh);
14002     entry = unit_test_util::BuildGoodSeq();
14003     feat = unit_test_util::AddMiscFeature (entry);
14004     feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
14005     feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both);
14006     feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
14007     feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14008     feat->SetData().SetRna().SetExt().SetName("mRNA product");
14009     CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature (feat);
14010     unit_test_util::AddFeat(gene, entry);
14011     // make pseudo to prevent splice errors
14012     feat->SetPseudo(true);
14013     // set trans-splicing exception to prevent mixed-strand error
14014     feat->SetExcept(true);
14015     feat->SetExcept_text("trans-splicing");
14016     seh = scope.AddTopLevelSeqEntry(*entry);
14017 
14018     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
14019                               "No CDS location match for 1 mRNA"));
14020     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BothStrands",
14021                       "mRNA may not be on both (forward and reverse) strands"));
14022     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "mRNAgeneRange",
14023                       "gene [gene locus:lcl|good:1-57] overlaps mRNA but does not completely contain it"));
14024     //AddChromosomeNoLocation(expected_errors, entry);
14025 
14026     eval = validator.Validate(seh, options);
14027     CheckErrors (*eval, expected_errors);
14028 
14029     CLEAR_ERRORS
14030 
14031     scope.RemoveTopLevelSeqEntry(seh);
14032     entry = unit_test_util::BuildGoodSeq();
14033     feat = unit_test_util::AddMiscFeature (entry);
14034     feat->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));
14035     feat->SetLocation().SetMix().Set().front()->SetInt().SetStrand(eNa_strand_both_rev);
14036     feat->SetLocation().SetMix().Set().back()->SetInt().SetStrand(eNa_strand_both_rev);
14037     feat->SetPseudo(true);
14038     feat->SetData().SetCdregion();
14039     seh = scope.AddTopLevelSeqEntry(*entry);
14040     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BothStrands",
14041                       "CDS may not be on both (reverse) strands"));
14042     //AddChromosomeNoLocation(expected_errors, entry);
14043     eval = validator.Validate(seh, options);
14044     CheckErrors (*eval, expected_errors);
14045 
14046     CLEAR_ERRORS
14047 }
14048 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSmRNArange)14049 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSmRNArange)
14050 {
14051     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
14052     CRef<CSeq_entry> nuc_seq = entry->SetSet().SetSeq_set().front();
14053     unit_test_util::SetSpliceForMixLoc (nuc_seq->SetSeq());
14054     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
14055     cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
14056     CRef<CSeq_feat> mrna(new CSeq_feat());
14057     mrna->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
14058     mrna->SetData().SetRna().SetType (CRNA_ref::eType_mRNA);
14059     mrna->SetData().SetRna().SetExt().SetName("mRNA product");
14060     mrna->SetLocation().SetMix().Set().front()->SetInt().SetTo(17);
14061     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[18] = 'G';
14062     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[19] = 'T';
14063     unit_test_util::AddFeat (mrna, entry->SetSet().SetSeq_set().front());
14064 
14065     STANDARD_SETUP
14066 
14067     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
14068                               "Unmatched CDS"));
14069     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAMismatchLocation",
14070                               "No CDS location match for 1 mRNA"));
14071     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNArange",
14072                       "mRNA contains CDS but internal intron-exon boundaries do not match"));
14073     //AddChromosomeNoLocation(expected_errors, entry);
14074     eval = validator.Validate(seh, options);
14075     CheckErrors (*eval, expected_errors);
14076 
14077     // turn off error for ribosomal slippage and trans-splicing
14078     CLEAR_ERRORS
14079     scope.RemoveTopLevelSeqEntry(seh);
14080     cds->SetExcept(true);
14081     cds->SetExcept_text("ribosomal slippage");
14082     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'A';
14083     seh = scope.AddTopLevelSeqEntry(*entry);
14084     //AddChromosomeNoLocation(expected_errors, entry);
14085     eval = validator.Validate(seh, options);
14086     CheckErrors (*eval, expected_errors);
14087 
14088     cds->SetExcept_text("trans-splicing");
14089     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'G';
14090     eval = validator.Validate(seh, options);
14091     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
14092                               "Unmatched CDS"));
14093     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAMismatchLocation",
14094                               "No CDS location match for 1 mRNA"));
14095     CheckErrors(*eval, expected_errors);
14096 
14097     // overlap problem rather than internal boundary problem
14098     scope.RemoveTopLevelSeqEntry(seh);
14099     entry = unit_test_util::BuildGoodNucProtSet();
14100     nuc_seq = entry->SetSet().SetSeq_set().front();
14101     unit_test_util::SetSpliceForMixLoc (nuc_seq->SetSeq());
14102     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
14103     cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
14104     mrna = new CSeq_feat();
14105     mrna->SetLocation().Assign(cds->GetLocation());
14106     mrna->SetData().SetRna().SetType (CRNA_ref::eType_mRNA);
14107     mrna->SetData().SetRna().SetExt().SetName("mRNA product");
14108     mrna->SetLocation().SetMix().Set().front()->SetInt().SetTo(12);
14109     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[13] = 'G';
14110     nuc_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[14] = 'T';
14111     unit_test_util::AddFeat (mrna, entry->SetSet().SetSeq_set().front());
14112     CRef<CSeq_entry> prot_seq = entry->SetSet().SetSeq_set().back();
14113     prot_seq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set()[4] = 'S';
14114     seh = scope.AddTopLevelSeqEntry(*entry);
14115     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNArange",
14116                       "mRNA overlaps or contains CDS but does not completely contain intervals"));
14117     eval = validator.Validate(seh, options);
14118     CheckErrors (*eval, expected_errors);
14119     CLEAR_ERRORS
14120 }
14121 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_OverlappingPeptideFeat)
{
    // Two processed-peptide features are placed on the same protein and an
    // OverlappingPeptideFeat warning is expected for each of them, first inside a
    // nuc-prot set and then on a stand-alone protein. An "alternative processing"
    // exception on one peptide is expected to silence both warnings.

    // Turns a feature into a protein feature named "unnamed" with the given
    // processing state.
    auto make_unnamed_peptide = [](CRef<CSeq_feat> feat, CProt_ref::EProcessed processed) {
        feat->SetData().SetProt().SetProcessed(processed);
        feat->SetData().SetProt().SetName().push_back("unnamed");
    };

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> prot_entry = entry->SetSet().SetSeq_set().back();
    CRef<CSeq_feat> first_peptide = unit_test_util::AddMiscFeature(prot_entry, 4);
    make_unnamed_peptide(first_peptide, CProt_ref::eProcessed_signal_peptide);
    CRef<CSeq_feat> second_peptide = unit_test_util::AddMiscFeature(prot_entry, 5);
    make_unnamed_peptide(second_peptide, CProt_ref::eProcessed_signal_peptide);

    STANDARD_SETUP

    // Re-runs the validator and compares with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Inside the nuc-prot set the message names the parent CDS sequence.
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "OverlappingPeptideFeat",
                           "Signal, Transit, or Mature peptide features overlap (parent CDS is on lcl|nuc)"));
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "OverlappingPeptideFeat",
                           "Signal, Transit, or Mature peptide features overlap (parent CDS is on lcl|nuc)"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS

    // Stand-alone protein: mature peptide plus transit peptide.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodProtSeq();
    first_peptide = unit_test_util::AddMiscFeature(entry, 4);
    make_unnamed_peptide(first_peptide, CProt_ref::eProcessed_mature);
    second_peptide = unit_test_util::AddMiscFeature(entry, 5);
    make_unnamed_peptide(second_peptide, CProt_ref::eProcessed_transit_peptide);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "OverlappingPeptideFeat",
                           "Signal, Transit, or Mature peptide features overlap"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "OverlappingPeptideFeat",
                           "Signal, Transit, or Mature peptide features overlap"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS

    //no error if peptide exceptions
    first_peptide->SetExcept(true);
    first_peptide->SetExcept_text("alternative processing");
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS
}
14173 
14174 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SerialInComment)
{
    // A feature comment ending in a bracketed number is expected to produce an
    // informational SerialInComment message.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> commented_feat = unit_test_util::AddMiscFeature(entry);
    commented_feat->SetComment("blah blah [123456]");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good", eDiag_Info, "SerialInComment",
            "Feature comment may refer to reference by serial number - attach reference specific comments to the reference REMARK instead."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14191 
14192 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleCDSproducts)
{
    // Adds a second coding region at 30..56 whose product is the first ID of the
    // last sequence in the good nuc-prot set; the validator is expected to report
    // MultipleCDSproducts.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> extra_cds = unit_test_util::AddMiscFeature(entry);
    extra_cds->SetData().SetCdregion();

    CRef<CSeq_entry> last_member = entry->SetSet().SetSeq_set().back();
    extra_cds->SetProduct().SetWhole().Assign(*(last_member->SetSeq().SetId().front()));

    extra_cds->SetLocation().SetInt().SetFrom(30);
    extra_cds->SetLocation().SetInt().SetTo(56);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Critical, "MultipleCDSproducts",
                           "Same product Bioseq from multiple CDS features"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14212 
14213 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FocusOnBioSourceFeature)
{
    // Sets is_focus on a BioSource feature (and calls SetFocus on the entry) and
    // expects the FocusOnBioSourceFeature error.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> source_feat = unit_test_util::AddGoodSourceFeature(entry);
    source_feat->SetData().SetBiosrc().SetIs_focus();
    unit_test_util::SetFocus(entry);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "FocusOnBioSourceFeature",
                           "Focus must be on BioSource descriptor, not BioSource feature."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14231 
14232 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PeptideFeatOutOfFrame)
{
    // Places a sig_peptide import feature on the nucleotide of a nuc-prot set and
    // checks the three PeptideFeatOutOfFrame wordings (stop, start, start and
    // stop). PeptideFeatureLacksCDS is expected alongside in every case.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc_entry = entry->SetSet().SetSeq_set().front();
    CRef<CSeq_feat> sig_peptide = unit_test_util::AddMiscFeature(nuc_entry, 6);
    sig_peptide->SetData().SetImp().SetKey("sig_peptide");

    STANDARD_SETUP

    // Re-runs the validator and compares with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    // Swaps in a fresh nuc-prot set whose sig_peptide is created with the given
    // AddMiscFeature argument and then has its start moved to position 1.
    auto restart_with_shifted_peptide = [&](size_t right_end) {
        scope.RemoveTopLevelSeqEntry(seh);
        entry = unit_test_util::BuildGoodNucProtSet();
        nuc_entry = entry->SetSet().SetSeq_set().front();
        sig_peptide = unit_test_util::AddMiscFeature(nuc_entry, right_end);
        sig_peptide->SetLocation().SetInt().SetFrom(1);
        sig_peptide->SetData().SetImp().SetKey("sig_peptide");
        seh = scope.AddTopLevelSeqEntry(*entry);
    };

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "PeptideFeatureLacksCDS",
                           "Peptide processing feature should be converted to the appropriate protein feature subtype"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "PeptideFeatOutOfFrame",
                           "Stop of sig_peptide is out of frame with CDS codons"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    restart_with_shifted_peptide(5);
    expected_errors[1]->SetErrMsg("Start of sig_peptide is out of frame with CDS codons");
    validate_and_check();

    restart_with_shifted_peptide(6);
    expected_errors[1]->SetErrMsg("Start and stop of sig_peptide are out of frame with CDS codons");
    validate_and_check();

    CLEAR_ERRORS
}
14274 
14275 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSgeneRange)14276 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSgeneRange)
14277 {
14278     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
14279     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
14280     CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature (cds);
14281     gene->SetLocation().SetInt().SetFrom(1);
14282     unit_test_util::AddFeat (gene, entry->SetSet().SetSeq_set().front());
14283 
14284     STANDARD_SETUP
14285 
14286     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSgeneRange",
14287                       "gene [gene locus:lcl|nuc:2-27] overlaps CDS but does not completely contain it"));
14288     //AddChromosomeNoLocation(expected_errors, entry);
14289     eval = validator.Validate(seh, options);
14290     CheckErrors (*eval, expected_errors);
14291 
14292     CLEAR_ERRORS
14293 
14294     scope.RemoveTopLevelSeqEntry(seh);
14295     gene->SetId().SetLocal().SetId(1);
14296     cds->SetId().SetLocal().SetId(2);
14297     CRef<CSeqFeatXref> gene_xref(new CSeqFeatXref());
14298     gene_xref->SetId().SetLocal().SetId(1);
14299     cds->SetXref().push_back(gene_xref);
14300     CRef<CSeqFeatXref> cds_xref(new CSeqFeatXref());
14301     cds_xref->SetId().SetLocal().SetId(2);
14302     gene->SetXref().push_back(cds_xref);
14303 
14304     seh = scope.AddTopLevelSeqEntry(*entry);
14305 
14306     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "CDSgeneRange",
14307                       "gene [gene locus:lcl|nuc:2-27] overlaps CDS but does not completely contain it"));
14308     //AddChromosomeNoLocation(expected_errors, entry);
14309     eval = validator.Validate(seh, options);
14310     CheckErrors (*eval, expected_errors);
14311 
14312     CLEAR_ERRORS
14313 
14314     // for VR-821
14315     scope.RemoveTopLevelSeqEntry(seh);
14316     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
14317     CRef<CSeq_entry> prot = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);
14318 
14319     CRef<CSeq_loc> cl1(new CSeq_loc());
14320     cl1->SetInt().SetFrom(0);
14321     cl1->SetInt().SetTo(8);
14322     cl1->SetInt().SetId().Assign(*(nuc->GetSeq().GetId().front()));
14323     CRef<CSeq_loc> cl2(new CSeq_loc());
14324     cl2->SetInt().SetFrom(21);
14325     cl2->SetInt().SetTo(26);
14326     cl2->SetInt().SetId().Assign(*(nuc->GetSeq().GetId().front()));
14327     CRef<CSeq_loc> gl1(new CSeq_loc());
14328     gl1->Assign(*cl2);
14329     CRef<CSeq_loc> gl2(new CSeq_loc());
14330     gl2->Assign(*cl1);
14331 
14332     cds->SetLocation().SetMix().Set().push_back(cl1);
14333     cds->SetLocation().SetMix().Set().push_back(cl2);
14334 
14335     gene->SetLocation().SetMix().Set().push_back(gl1);
14336     gene->SetLocation().SetMix().Set().push_back(gl2);
14337 
14338     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAGTAACAGAGAAGAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
14339 
14340     prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MPRN");
14341     prot->SetSeq().SetInst().SetLength(4);
14342     CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
14343     prot_feat->SetLocation().SetInt().SetTo(3);
14344 
14345     seh = scope.AddTopLevelSeqEntry(*entry);
14346 
14347     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
14348         "MultiIntervalGene", "Gene feature on non-segmented sequence should not have multiple intervals"));
14349     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
14350         "CDSgeneRange", "gene [gene locus:[lcl|nuc:22-27, 1-9]] overlaps CDS but does not completely contain it"));
14351     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error,
14352         "SeqLocOrder", "Location: Intervals out of order in SeqLoc [(lcl|nuc:22-27, 1-9)]"));
14353     //AddChromosomeNoLocation(expected_errors, entry);
14354     eval = validator.Validate(seh, options);
14355     CheckErrors (*eval, expected_errors);
14356 
14357     CLEAR_ERRORS
14358 
14359     // no CDSGeneRange error if trans-spliced
14360     cds->SetExcept(true);
14361     cds->SetExcept_text("trans-splicing");
14362 
14363     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning,
14364         "MultiIntervalGene", "Gene feature on non-segmented sequence should not have multiple intervals"));
14365     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error,
14366         "SeqLocOrder", "Location: Intervals out of order in SeqLoc [(lcl|nuc:22-27, 1-9)]"));
14367     //AddChromosomeNoLocation(expected_errors, entry);
14368     eval = validator.Validate(seh, options);
14369     CheckErrors (*eval, expected_errors);
14370 
14371     CLEAR_ERRORS
14372 }
14373 
14374 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleMRNAproducts)
{
    // Adds an extra mRNA with product lcl|nuc to the first member of the good
    // gen-prod set and expects the four messages listed below.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
    CRef<CSeq_entry> first_member = entry->SetSet().SetSeq_set().front();
    CRef<CSeq_feat> extra_mrna = unit_test_util::AddMiscFeature(first_member);
    extra_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
    extra_mrna->SetData().SetRna().SetExt().SetName("fake protein name");
    extra_mrna->SetProduct().SetWhole().SetLocal().SetStr("nuc");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "FeatureProductInconsistency",
                           "mRNA products are not unique"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
                           "No CDS location match for 1 mRNA"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "TranscriptLen",
                           "Transcript length [11] less than product length [27], and tail < 95% polyA"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Critical, "IdenticalMRNAtranscriptIDs",
                           "Identical transcript IDs found on multiple mRNAs"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14400 
14401 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_mRNAgeneRange)
{
    // A gene whose start is moved to 5 and an mRNA whose end is moved to 10:
    // mRNAgeneRange is expected. It is no longer expected once another gene,
    // or an operon, with its end at 10 is added.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> late_gene = unit_test_util::AddMiscFeature(entry);
    late_gene->SetData().SetGene().SetLocus("locus");
    late_gene->SetLocation().SetInt().SetFrom(5);
    CRef<CSeq_feat> transcript = unit_test_util::AddMiscFeature(entry);
    transcript->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
    transcript->SetLocation().SetInt().SetTo(10);

    STANDARD_SETUP

    // Re-runs the validator and compares with expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "mRNAgeneRange",
                           "gene [locus:lcl|good:6-11] overlaps mRNA but does not completely contain it"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
                           "No CDS location match for 1 mRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS

    // if there is an overlapping gene or operon, error is suppressed
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_feat> container = unit_test_util::AddMiscFeature(entry);
    container->SetData().SetGene().SetLocus("locus2");
    container->SetLocation().SetInt().SetTo(10);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
                           "No CDS location match for 1 mRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    // Same expectation when the extra feature is an operon instead of a gene.
    scope.RemoveTopLevelSeqEntry(seh);
    container->SetData().SetImp().SetKey("operon");
    container->AddQualifier("operon", "operon name");
    seh = scope.AddTopLevelSeqEntry(*entry);
    validate_and_check();

    CLEAR_ERRORS
}
14445 
// TranscriptLen / PolyATail: varies the end of the mRNA feature on the
// contig of a gen-prod-set and checks the length-related messages that
// compare the transcript with its product.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranscriptLen)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
    CRef<CSeq_entry> genomic = entry->SetSet().SetSeq_set().front();
    // last feature of the first annotation on the contig (the mRNA)
    CRef<CSeq_feat> transcript =
        genomic->SetSeq().SetAnnot().front()->SetData().SetFtable().back();
    transcript->SetLocation().SetInt().SetTo(10);

    STANDARD_SETUP

    // transcript shorter than the product
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CDSwithNoMRNA",
        "Unmatched CDS"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
        "No CDS location match for 1 mRNA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CDSmRNArange",
        "mRNA overlaps or contains CDS but does not completely contain intervals"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "TranscriptLen",
        "Transcript length [11] less than product length [27], and tail < 95% polyA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // allow for polyA tail
    scope.RemoveTopLevelSeqEntry(seh);
    transcript->SetLocation().SetInt().SetTo(25);
    seh = scope.AddTopLevelSeqEntry(*entry);
    // the first three expectations are reused; only the fourth is rewritten
    expected_errors[3]->SetSeverity(eDiag_Info);
    expected_errors[3]->SetErrCode("PolyATail");
    expected_errors[3]->SetErrMsg("Transcript length [26] less than product length [27], but tail is 100% polyA");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // transcript longer than the product
    scope.RemoveTopLevelSeqEntry(seh);
    transcript->SetLocation().SetInt().SetTo(37);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "TranscriptLen",
        "Transcript length [38] greater than product length [27]"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14487 
// TranscriptMismatches: replaces the nucleotide sequence inside the
// nuc-prot set of a gen-prod-set and expects a one-base mismatch report,
// then expects no report once the mRNA feature carries an exception.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranscriptMismatches)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
    CRef<CSeq_entry> nuc_prot = unit_test_util::GetNucProtSetFromGenProdSet(entry);
    CRef<CSeq_entry> product_seq =
        unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(nuc_prot);
    product_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAATTAA");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "TranscriptMismatches",
        "There are 1 mismatches out of 27 bases between the transcript and product sequence"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // suppress error if exception
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_entry> genomic = entry->SetSet().SetSeq_set().front();
    CRef<CSeq_feat> transcript =
        genomic->SetSeq().SetAnnot().front()->SetData().SetFtable().back();
    transcript->SetExcept_text("mismatches in transcription");
    transcript->SetExcept(true);
    seh = scope.AddTopLevelSeqEntry(*entry);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14518 
14519 
// CDSproductPackagingProblem: a good nuc-prot set whose class is switched
// to eco_set is expected to yield both packaging messages below.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSproductPackagingProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    entry->SetSet().SetClass(CBioseq_set::eClass_eco_set);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Error, "CDSproductPackagingProblem",
        "Protein product not packaged in nuc-prot set with nucleotide"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Critical, "FeaturePackagingProblem",
        "There is 1 mispackaged feature in this record."));
    //AddChromosomeNoLocation(expected_errors, "lcl|nuc");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14537 
14538 
// Duplicate intervals: a tRNA anticodon whose two mix parts are the same
// interval, followed by a feature location built from a mix location whose
// last part is reset to 0..15.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateInterval)
{
    // error for duplicate in tRNA anticodon location
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna_feat = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
    CRef<CSeq_loc> anticodon = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());

    // both ends of the mix are set to the same interval, 8..10
    CRef<CSeq_loc> first_part = anticodon->SetMix().Set().front();
    CRef<CSeq_loc> last_part = anticodon->SetMix().Set().back();
    first_part->SetInt().SetFrom(8);
    first_part->SetInt().SetTo(10);
    last_part->SetInt().SetFrom(8);
    last_part->SetInt().SetTo(10);

    trna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon);
    unit_test_util::AddFeat(trna_feat, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "tRNArange",
        "Anticodon is not 3 bases in length"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "DuplicateAnticodonInterval",
        "Duplicate anticodon exons in location"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // different error for feature location
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    CRef<CSeq_loc> mix_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
    CRef<CSeq_loc> mix_tail = mix_loc->SetMix().Set().back();
    mix_tail->SetInt().SetFrom(0);
    mix_tail->SetInt().SetTo(15);
    misc_feat->SetLocation().Assign(*mix_loc);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "DuplicateExonInterval",
        "Duplicate exons in location"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14583 
14584 
// PolyAsiteNotPoint: a polyA_site feature on the default misc-feature
// location is reported; no message is expected once the location is a
// single point.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PolyAsiteNotPoint)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
    feat->SetData().SetImp().SetKey("polyA_site");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "PolyAsiteNotPoint",
                      "PolyA_site should be a single point"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS

    // error should go away if feature location is single point.
    // The entry is detached from the scope while the location is edited and
    // re-added afterwards, matching how the other tests in this file change
    // feature locations between validations.
    scope.RemoveTopLevelSeqEntry(seh);
    feat->SetLocation().SetPnt().SetId().SetLocal().SetStr("good");
    feat->SetLocation().SetPnt().SetPoint(5);
    seh = scope.AddTopLevelSeqEntry(*entry);

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
14610 
14611 
// ImpFeatBadLoc: checks the two messages produced for the Imp-feat 'loc'
// string: obsolete "one-of" text, and a loc that disagrees with the
// feature location.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImpFeatBadLoc)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    misc_feat->SetData().SetImp().SetLoc("one-of three");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "ImpFeatBadLoc",
        "ImpFeat loc one-of three has obsolete 'one-of' text for feature misc_feature"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // same error code and severity, different message for a mismatching loc
    // NOTE(review): the feature is edited while the entry is still attached
    // to the scope; other tests in this file detach and re-add it first.
    expected_errors[0]->SetErrMsg("ImpFeat loc 5..12 does not equal feature location 1..11 for feature misc_feature");
    misc_feat->SetData().SetImp().SetLoc("5..12");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14633 
14634 
// UnnecessaryCitPubEquiv: a feature citation that holds a PMID pub followed
// by a Pub-equiv is expected to produce the warning below.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryCitPubEquiv)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> cited_feat = unit_test_util::AddMiscFeature(entry);

    // first citation entry: a plain PMID
    CRef<CPub> pmid_pub(new CPub());
    pmid_pub->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(1)));
    cited_feat->SetCit().SetPub().push_back(pmid_pub);

    // second citation entry: a Pub-equiv
    CRef<CPub> equiv_pub(new CPub());
    equiv_pub->SetEquiv();
    cited_feat->SetCit().SetPub().push_back(equiv_pub);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "UnnecessaryCitPubEquiv",
        "Citation on feature has unexpected internal Pub-equiv"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14655 
14656 
// ImpCDShasTranslation: a pseudo Imp-feat with key "CDS" that carries a
// translation qualifier is expected to produce the error below.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImpCDShasTranslation)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> imp_cds = unit_test_util::AddMiscFeature(entry);
    imp_cds->SetData().SetImp().SetKey("CDS");
    imp_cds->SetPseudo(true);
    imp_cds->AddQualifier("translation", "unexpected translation");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "ImpCDShasTranslation",
        "ImpFeat CDS with /translation found"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14676 
14677 
// ImpCDSnotPseudo: an Imp-feat with key "CDS" that is not pseudo is
// reported; no message is expected after a pseudo gene is made for it.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImpCDSnotPseudo)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> imp_cds = unit_test_util::AddMiscFeature(entry);
    imp_cds->SetData().SetImp().SetKey("CDS");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "ImpCDSnotPseudo",
        "ImpFeat CDS should be pseudo"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // overlapping pseudogene should suppress
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_feat> pseudo_gene = unit_test_util::MakeGeneForFeature(imp_cds);
    pseudo_gene->SetPseudo(true);
    unit_test_util::AddFeat(pseudo_gene, entry);
    seh = scope.AddTopLevelSeqEntry(*entry);

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14707 
14708 
// MissingMRNAproduct: an extra mRNA on the contig of a gen-prod-set points
// at the product id "not_present_ever"; the four messages below are
// expected. Runs with the CGenBankFixture fixture.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_FEAT_MissingMRNAproduct, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
    CRef<CSeq_entry> genomic = entry->SetSet().SetSeq_set().front();

    CRef<CSeq_feat> orphan_mrna = unit_test_util::AddMiscFeature(genomic);
    orphan_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
    orphan_mrna->SetData().SetRna().SetExt().SetName("fake protein name");
    orphan_mrna->SetProduct().SetWhole().SetLocal().SetStr("not_present_ever");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
        "No CDS location match for 1 mRNA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "ProductFetchFailure",
        "Unable to fetch mRNA transcript 'lcl|not_present_ever'"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "MissingMRNAproduct",
        "Product Bioseq of mRNA feature is not packaged in the record"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "GenomicProductPackagingProblem",
        "Product of mRNA feature (lcl|not_present_ever) not packaged in genomic product set"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14734 
14735 
// AbuttingIntervals: adjacent intervals are reported as a warning inside a
// tRNA anticodon and as an error inside a feature location.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_AbuttingIntervals)
{
    // error for abutting tRNA anticodon location
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna_feat = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
    CRef<CSeq_loc> anticodon = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());

    // 8..8 directly followed by 9..10
    CRef<CSeq_loc> first_part = anticodon->SetMix().Set().front();
    CRef<CSeq_loc> last_part = anticodon->SetMix().Set().back();
    first_part->SetInt().SetFrom(8);
    first_part->SetInt().SetTo(8);
    last_part->SetInt().SetFrom(9);
    last_part->SetInt().SetTo(10);

    trna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().Assign(*anticodon);
    trna_feat->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('F');
    unit_test_util::AddFeat(trna_feat, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "AbuttingIntervals",
        "Adjacent intervals in Anticodon"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // different error for feature location
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    CRef<CSeq_loc> mix_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());

    // 0..7 directly followed by 8..15
    CRef<CSeq_loc> mix_head = mix_loc->SetMix().Set().front();
    CRef<CSeq_loc> mix_tail = mix_loc->SetMix().Set().back();
    mix_head->SetInt().SetFrom(0);
    mix_head->SetInt().SetTo(7);
    mix_tail->SetInt().SetFrom(8);
    mix_tail->SetInt().SetTo(15);
    misc_feat->SetLocation().Assign(*mix_loc);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "AbuttingIntervals",
        "Location: Adjacent intervals in SeqLoc [(lcl|good:1-8, 9-16)]"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14781 
14782 
// Colliding gene names: two genes share a locus (exactly, then differing
// only in capitalization) at separate, identical and overlapping
// locations. Only the identical-location case expects messages.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CollidingGeneNames)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> first_gene = unit_test_util::AddMiscFeature(entry);
    first_gene->SetLocation().SetInt().SetFrom(0);
    first_gene->SetLocation().SetInt().SetTo(7);
    first_gene->SetData().SetGene().SetLocus("see_it_twice");

    CRef<CSeq_feat> second_gene = unit_test_util::AddMiscFeature(entry);
    second_gene->SetLocation().SetInt().SetFrom(15);
    second_gene->SetLocation().SetInt().SetTo(20);
    second_gene->SetData().SetGene().SetLocus("see_it_twice");

    STANDARD_SETUP

    // used to produce an error, removed per VR-811
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // same locus with different capitalization: still nothing expected
    scope.RemoveTopLevelSeqEntry(seh);
    second_gene->SetData().SetGene().SetLocus("See_It_Twice");
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // second gene moved onto the same interval as the first (0..7)
    scope.RemoveTopLevelSeqEntry(seh);
    second_gene->SetLocation().SetInt().SetFrom(0);
    second_gene->SetLocation().SetInt().SetTo(7);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "DuplicateFeat",
        "Features have identical intervals, but labels differ"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "DuplicateGeneConflictingLocusTag",
        "Colliding names (with different capitalization) in gene features, but feature locations are identical"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // this situation used to produce an error, removed VR-801
    scope.RemoveTopLevelSeqEntry(seh);
    second_gene->SetLocation().SetInt().SetFrom(10);
    second_gene->SetLocation().SetInt().SetTo(17);
    seh = scope.AddTopLevelSeqEntry(*entry);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14835 
14836 
// MultiIntervalGene: a gene whose location is a mix location is expected
// to produce the warning below.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultiIntervalGene)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> split_gene = unit_test_util::AddMiscFeature(entry);
    split_gene->SetData().SetGene().SetLocus("multi-interval");

    CRef<CSeq_loc> mix_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
    split_gene->SetLocation().Assign(*mix_loc);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "MultiIntervalGene",
        "Gene feature on non-segmented sequence should not have multiple intervals"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
14855 
14856 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatContentDup)14857 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatContentDup)
14858 {
14859     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
14860     CRef<CSeq_feat> feat1 = unit_test_util::AddMiscFeature(entry);
14861     CRef<CSeq_feat> feat2 = unit_test_util::AddMiscFeature(entry);
14862 
14863     STANDARD_SETUP
14864     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
14865       "Duplicate feature"));
14866     //AddChromosomeNoLocation(expected_errors, entry);
14867     eval = validator.Validate(seh, options);
14868     CheckErrors (*eval, expected_errors);
14869     set< CSeq_feat_Handle > dups = validator::GetDuplicateFeaturesForRemoval(seh);
14870     BOOST_CHECK_EQUAL(dups.size(), 1);
14871 
14872 
14873     // many suppression conditions
14874     // region
14875     scope.RemoveTopLevelSeqEntry(seh);
14876     feat1->SetData().SetRegion("region");
14877     feat2->SetData().SetRegion("region");
14878     seh = scope.AddTopLevelSeqEntry(*entry);
14879     eval = validator.Validate(seh, options);
14880     CheckErrors (*eval, expected_errors);
14881 
14882     dups = validator::GetDuplicateFeaturesForRemoval(seh);
14883     BOOST_CHECK_EQUAL(dups.size(), 1);
14884 
14885     CLEAR_ERRORS
14886     //suppress if different dbxrefs
14887     scope.RemoveTopLevelSeqEntry(seh);
14888     unit_test_util::SetDbxref (feat1, "ASAP", "first");
14889     unit_test_util::SetDbxref (feat2, "ASAP", "second");
14890     seh = scope.AddTopLevelSeqEntry(*entry);
14891     //AddChromosomeNoLocation(expected_errors, entry);
14892     eval = validator.Validate(seh, options);
14893     CheckErrors (*eval, expected_errors);
14894 
14895     dups = validator::GetDuplicateFeaturesForRemoval(seh);
14896     BOOST_CHECK_EQUAL(dups.size(), 0);
14897 
14898     // variation
14899     scope.RemoveTopLevelSeqEntry(seh);
14900     feat1->SetData().SetImp().SetKey("variation");
14901     feat2->SetData().SetImp().SetKey("variation");
14902     seh = scope.AddTopLevelSeqEntry(*entry);
14903     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
14904       "Duplicate feature"));
14905     eval = validator.Validate(seh, options);
14906     CheckErrors (*eval, expected_errors);
14907 
14908     dups = validator::GetDuplicateFeaturesForRemoval(seh);
14909     BOOST_CHECK_EQUAL(dups.size(), 1);
14910 
14911     CLEAR_ERRORS
14912     // suppress if different replace qualifiers
14913     scope.RemoveTopLevelSeqEntry(seh);
14914     feat1->AddQualifier("replace", "a");
14915     feat2->AddQualifier("replace", "t");
14916     seh = scope.AddTopLevelSeqEntry(*entry);
14917     //AddChromosomeNoLocation(expected_errors, entry);
14918     eval = validator.Validate(seh, options);
14919     CheckErrors (*eval, expected_errors);
14920 
14921     CLEAR_ERRORS
14922     dups = validator::GetDuplicateFeaturesForRemoval(seh);
14923     BOOST_CHECK_EQUAL(dups.size(), 0);
14924 
14925     // coding regions/mRNAs with different links
14926     scope.RemoveTopLevelSeqEntry(seh);
14927     entry = unit_test_util::BuildGoodNucProtSet();
14928     CRef<CSeq_feat> cds1 = unit_test_util::GetCDSFromGoodNucProtSet(entry);
14929     CRef<CSeq_feat> cds2 = unit_test_util::MakeCDSForGoodNucProtSet("nuc", "prot2");
14930     unit_test_util::AddFeat (cds2, entry);
14931     CRef<CSeq_entry> pentry = unit_test_util::MakeProteinForGoodNucProtSet("prot2");
14932     entry->SetSet().SetSeq_set().push_back(pentry);
14933     CRef<CSeq_entry> nentry = entry->SetSet().SetSeq_set().front();
14934     CRef<CSeq_feat> mrna1 = unit_test_util::MakeCDSForGoodNucProtSet("nuc", "prot1");
14935     mrna1->ResetProduct();
14936     mrna1->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14937     unit_test_util::AddFeat (mrna1, nentry);
14938     CRef<CSeq_feat> mrna2 = unit_test_util::MakeCDSForGoodNucProtSet("nuc", "prot1");
14939     mrna2->ResetProduct();
14940     mrna2->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
14941     unit_test_util::AddFeat (mrna2, nentry);
14942     seh = scope.AddTopLevelSeqEntry(*entry);
14943 
14944     // two duplicate feature errors, one for cds, one for mRNA
14945 //    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "CDSwithMultipleMRNAs",
14946 //      "CDS overlapped by 2 mRNAs, but product locations are unique"));
14947 //    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Info, "CDSwithMultipleMRNAs",
14948 //      "CDS overlapped by 2 mRNAs, but product locations are unique"));
14949     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatContentDup",
14950       "Duplicate feature"));
14951     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "FeatContentDup",
14952       "Duplicate feature"));
14953     //AddChromosomeNoLocation(expected_errors, entry);
14954 
14955     eval = validator.Validate(seh, options);
14956     CheckErrors (*eval, expected_errors);
14957 
14958     dups = validator::GetDuplicateFeaturesForRemoval(seh);
14959     BOOST_CHECK_EQUAL(dups.size(), 2);
14960 
14961     // suppress errors if cdss and mrnas are linked AND mRNAs have different locations
14962     CLEAR_ERRORS
14963     scope.RemoveTopLevelSeqEntry(seh);
14964     cds1->SetId().SetLocal().SetId(1);
14965     cds2->SetId().SetLocal().SetId(2);
14966     mrna1->SetId().SetLocal().SetId(3);
14967     mrna2->SetId().SetLocal().SetId(4);
14968     cds1->AddSeqFeatXref(mrna1->GetId());
14969     cds2->AddSeqFeatXref(mrna2->GetId());
14970     mrna1->AddSeqFeatXref(cds1->GetId());
14971     mrna2->AddSeqFeatXref(cds2->GetId());
14972     mrna2->SetLocation().SetInt().SetTo(mrna2->GetLocation().GetInt().GetTo() + 10);
14973     seh = scope.AddTopLevelSeqEntry(*entry);
14974     //AddChromosomeNoLocation(expected_errors, entry);
14975     eval = validator.Validate(seh, options);
14976     CheckErrors (*eval, expected_errors);
14977 
14978     dups = validator::GetDuplicateFeaturesForRemoval(seh);
14979     BOOST_CHECK_EQUAL(dups.size(), 0);
14980 
14981     CLEAR_ERRORS
14982 }
14983 
14984 
ChangeGoodNucProtSetIdToGenbankName(CRef<CSeq_entry> entry,string name)14985 static void ChangeGoodNucProtSetIdToGenbankName (CRef<CSeq_entry> entry, string name)
14986 {
14987     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
14988     CRef<CSeq_entry> nuc_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet (entry);
14989     CRef<CSeq_entry> prot_seq = unit_test_util::GetProteinSequenceFromGoodNucProtSet (entry);
14990     CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet (entry);
14991 
14992     cds->SetProduct().SetWhole().SetGenbank().SetName(name);
14993     prot_seq->SetSeq().SetId().front()->SetGenbank().SetName(name);
14994     prot_feat->SetLocation().SetInt().SetId().SetGenbank().SetName(name);
14995 }
14996 
14997 
// BadProductSeqId: checks the messages for a product id that uses the
// Textseq-id 'name' slot (with an accession-like name and with a name that
// looks like a local id), and for a product id whose capitalization
// differs from the product sequence.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadProductSeqId)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    // try one that looks like a valid ID
    ChangeGoodNucProtSetIdToGenbankName(entry, "AY123456");

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
              "Feature product should not put an accession in the Textseq-id 'name' slot"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
              "Protein bioseq has Textseq-id 'name' that looks like it is derived from a nucleotide accession"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
    // try one that looks like a local ID
    scope.RemoveTopLevelSeqEntry(seh);
    ChangeGoodNucProtSetIdToGenbankName(entry, "lcl|prot");
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
              "Feature product should not use Textseq-id 'name' slot"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "BadProductSeqId",
              "Protein bioseq has Textseq-id 'name' and no accession"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS

    // change capitalization
    // (helpers are qualified with unit_test_util:: like the rest of the file)
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    // upper-case the local id of the CDS product in place
    NStr::ToUpper(cds->SetProduct().SetWhole().SetLocal().SetStr());
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "BadProductSeqId",
        "Capitalization change from product location on feature to product sequence"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15042 
15043 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RnaProductMismatch)15044 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RnaProductMismatch)
15045 {
15046     CRef<CSeq_entry> nuc = unit_test_util::BuildGoodSeq();
15047     CRef<CSeq_feat> rna_feat = unit_test_util::AddMiscFeature (nuc);
15048     rna_feat->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
15049     rna_feat->SetLocation().SetInt().SetTo(59);
15050     rna_feat->SetProduct().SetWhole().SetLocal().SetStr("rna");
15051 
15052     CRef<CSeq_entry> rna_seq = unit_test_util::BuildGoodSeq();
15053     rna_seq->SetSeq().SetId().front()->SetLocal().SetStr("rna");
15054 
15055     CRef<CSeq_entry> entry(new CSeq_entry());
15056     entry->SetSet().SetClass(CBioseq_set::eClass_gen_prod_set);
15057     entry->SetSet().SetSeq_set().push_back (nuc);
15058     entry->SetSet().SetSeq_set().push_back (rna_seq);
15059 
15060     STANDARD_SETUP
15061 
15062     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
15063                       "No CDS location match for 1 mRNA"));
15064     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RnaProductMismatch",
15065               "Type of RNA does not match MolInfo of product Bioseq"));
15066     //AddChromosomeNoLocation(expected_errors, entry);
15067     eval = validator.Validate(seh, options);
15068     CheckErrors (*eval, expected_errors);
15069 
15070     // make error go away
15071     CLEAR_ERRORS
15072     scope.RemoveTopLevelSeqEntry(seh);
15073     rna_seq->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
15074     unit_test_util::SetBiomol (rna_seq, CMolInfo::eBiomol_mRNA);
15075     seh = scope.AddTopLevelSeqEntry(*entry);
15076     eval = validator.Validate(seh, options);
15077     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
15078                       "No CDS location match for 1 mRNA"));
15079     //AddChromosomeNoLocation(expected_errors, entry);
15080     CheckErrors (*eval, expected_errors);
15081 
15082     CLEAR_ERRORS
15083     // also get errors for tRNA
15084     scope.RemoveTopLevelSeqEntry(seh);
15085     rna_feat->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
15086     rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('N');
15087     rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetId().SetLocal().SetStr("good");
15088     rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetFrom(11);
15089     rna_feat->SetData().SetRna().SetExt().SetTRNA().SetAnticodon().SetInt().SetTo(13);
15090     seh = scope.AddTopLevelSeqEntry(*entry);
15091     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RnaProductMismatch",
15092               "Type of RNA does not match MolInfo of product Bioseq"));
15093     //AddChromosomeNoLocation(expected_errors, entry);
15094     eval = validator.Validate(seh, options);
15095     CheckErrors (*eval, expected_errors);
15096 
15097     // make error go away
15098     CLEAR_ERRORS
15099     scope.RemoveTopLevelSeqEntry(seh);
15100     unit_test_util::SetBiomol (rna_seq, CMolInfo::eBiomol_tRNA);
15101     seh = scope.AddTopLevelSeqEntry(*entry);
15102     //AddChromosomeNoLocation(expected_errors, entry);
15103     eval = validator.Validate(seh, options);
15104     CheckErrors (*eval, expected_errors);
15105 
15106     // also get errors for rRNA
15107     scope.RemoveTopLevelSeqEntry(seh);
15108     rna_feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
15109     rna_feat->SetData().SetRna().SetExt().SetName("a ribosomal RNA");
15110     seh = scope.AddTopLevelSeqEntry(*entry);
15111     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "RnaProductMismatch",
15112               "Type of RNA does not match MolInfo of product Bioseq"));
15113     eval = validator.Validate(seh, options);
15114     CheckErrors (*eval, expected_errors);
15115 
15116     // make error go away
15117     CLEAR_ERRORS
15118     scope.RemoveTopLevelSeqEntry(seh);
15119     unit_test_util::SetBiomol (rna_seq, CMolInfo::eBiomol_rRNA);
15120     seh = scope.AddTopLevelSeqEntry(*entry);
15121     //AddChromosomeNoLocation(expected_errors, entry);
15122     eval = validator.Validate(seh, options);
15123     CheckErrors (*eval, expected_errors);
15124 
15125     CLEAR_ERRORS
15126 }
15127 
15128 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingCDSproduct)15129 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingCDSproduct)
15130 {
15131     CRef<CSeq_entry> entry(new CSeq_entry());
15132     entry->SetSet().SetClass(CBioseq_set::eClass_genbank);
15133     CRef<CSeq_entry> nuc = unit_test_util::BuildGoodSeq();
15134     CRef<CSeq_feat> cds = unit_test_util::AddMiscFeature(nuc);
15135     cds->SetData().SetCdregion();
15136     cds->SetProduct().SetWhole().SetLocal().SetStr("not_present_ever");
15137     entry->SetSet().SetSeq_set().push_back (nuc);
15138     STANDARD_SETUP
15139 
15140     BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
15141     BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);
15142 
15143     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
15144               "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
15145     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
15146               "Missing stop codon"));
15147     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "MissingCDSproduct",
15148               "Unable to find product Bioseq from CDS feature"));
15149     //AddChromosomeNoLocation(expected_errors, entry);
15150     eval = validator.Validate(seh, options);
15151     CheckErrors (*eval, expected_errors);
15152 
15153     cds->ResetProduct();
15154     expected_errors[2]->SetErrMsg("Expected CDS product absent");
15155     expected_errors[2]->SetSeverity(eDiag_Error);
15156     eval = validator.Validate(seh, options);
15157     CheckErrors (*eval, expected_errors);
15158 
15159     // ok if pseudo
15160     CLEAR_ERRORS
15161     cds->SetPseudo(true);
15162     //AddChromosomeNoLocation(expected_errors, entry);
15163     eval = validator.Validate(seh, options);
15164     CheckErrors (*eval, expected_errors);
15165 
15166     // also ok if exception
15167     cds->ResetPseudo();
15168     cds->SetExcept(true);
15169     cds->SetExcept_text("rearrangement required for product");
15170     eval = validator.Validate(seh, options);
15171     CheckErrors (*eval, expected_errors);
15172 
15173     // also ok if CDS contains just stop codon
15174     scope.RemoveTopLevelSeqEntry(seh);
15175     cds->ResetExcept();
15176     cds->ResetExcept_text();
15177     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATAAGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
15178     cds->SetLocation().SetInt().SetTo(4);
15179     cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
15180     cds->SetPartial(true);
15181     cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_three);
15182     seh = scope.AddTopLevelSeqEntry(*entry);
15183     eval = validator.Validate(seh, options);
15184     CheckErrors (*eval, expected_errors);
15185 
15186     CLEAR_ERRORS
15187 }
15188 
15189 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrnaCodon)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // tRNA feature on the sequence, given a codon value of 64.
    CRef<CSeq_feat> trna_feat = unit_test_util::BuildGoodtRNA(entry->SetSeq().SetId().front());
    auto& codons = trna_feat->SetData().SetRna().SetExt().SetTRNA().SetCodon();
    codons.push_back(64);
    unit_test_util::AddFeat(trna_feat, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadTrnaCodon", "tRNA codon value 64 is greater than maximum 63"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15206 
15207 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrnaAA)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Case 1: tRNA feature whose amino acid has been removed.
    CRef<CSeq_feat> trna_feat = unit_test_util::BuildGoodtRNA(entry->SetSeq().SetId().front());
    auto& trna_ext = trna_feat->SetData().SetRna().SetExt().SetTRNA();
    trna_ext.ResetAa();
    unit_test_util::AddFeat(trna_feat, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadTrnaAA", "Missing tRNA amino acid"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Case 2: amino acid present, but set to the IUPACaa value 29.
    CLEAR_ERRORS
    trna_ext.SetAa().SetIupacaa(29);

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadAnticodonAA", "Codons predicted from anticodon (AAA) cannot produce amino acid ( /OTHER)"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadTrnaAA", "Invalid tRNA amino acid"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15235 
15236 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_OnlyGeneXrefs)
{
    // One feature carrying a gene xref with locus "foo"; this test adds no
    // gene feature.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> xref_feat = unit_test_util::AddMiscFeature(entry);
    xref_feat->SetGeneXref().SetLocus("foo");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutGene", "Feature has gene locus cross-reference but no equivalent gene feature exists"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OnlyGeneXrefs", "There are 1 gene xrefs and no gene features in this record."));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15253 
15254 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UTRdoesNotAbutCDS)15255 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UTRdoesNotAbutCDS)
15256 {
15257     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
15258     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet (entry);
15259     CRef<CSeq_entry> nseq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
15260     CRef<CSeq_entry> pseq = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);
15261     cds->SetLocation().SetInt().SetFrom(3);
15262     nseq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("CCCATGAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
15263     pseq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MRKTEIN");
15264     pseq->SetSeq().SetInst().SetLength(7);
15265     unit_test_util::AdjustProtFeatForNucProtSet (entry);
15266 
15267     CRef<CSeq_feat> utr5 = unit_test_util::AddMiscFeature(nseq);
15268     utr5->SetData().SetImp().SetKey("5'UTR");
15269     utr5->SetLocation().SetInt().SetTo(1);
15270 
15271     CRef<CSeq_feat> utr3 = unit_test_util::AddMiscFeature(nseq);
15272     utr3->SetData().SetImp().SetKey("3'UTR");
15273     utr3->SetLocation().SetInt().SetFrom(28);
15274     utr3->SetLocation().SetInt().SetTo(59);
15275 
15276     STANDARD_SETUP
15277 
15278     expected_errors.push_back(new CExpectedError("lcl|nuc",eDiag_Warning,"UTRdoesNotAbutCDS",
15279                               "5'UTR does not abut CDS"));
15280     expected_errors.push_back(new CExpectedError("lcl|nuc",eDiag_Warning,"UTRdoesNotAbutCDS",
15281                               "CDS does not abut 3'UTR"));
15282     //AddChromosomeNoLocation(expected_errors, entry);
15283 
15284     eval = validator.Validate(seh, options);
15285     CheckErrors (*eval, expected_errors);
15286 
15287     scope.RemoveTopLevelSeqEntry(seh);
15288     utr5->SetLocation().SetInt().SetTo(2);
15289     utr5->SetLocation().SetInt().SetStrand(eNa_strand_minus);
15290     utr3->SetLocation().SetInt().SetFrom(27);
15291     utr3->SetLocation().SetInt().SetStrand(eNa_strand_minus);
15292     seh = scope.AddTopLevelSeqEntry(*entry);
15293 
15294     expected_errors[0]->SetErrMsg("5'UTR is not on plus strand");
15295     expected_errors[1]->SetErrMsg("3'UTR is not on plus strand");
15296     eval = validator.Validate(seh, options);
15297     CheckErrors (*eval, expected_errors);
15298 
15299     scope.RemoveTopLevelSeqEntry(seh);
15300     unit_test_util::RevComp(entry);
15301     seh = scope.AddTopLevelSeqEntry(*entry);
15302     expected_errors[0]->SetErrMsg("3'UTR is not on minus strand");
15303     expected_errors[1]->SetErrMsg("5'UTR is not on minus strand");
15304     eval = validator.Validate(seh, options);
15305     CheckErrors (*eval, expected_errors);
15306 
15307     CLEAR_ERRORS
15308 
15309 
15310 }
15311 
15312 
// Exercises the "ExceptionProblem" diagnostic on a single feature by varying
// its exception text and comment, first on a local-id sequence and then after
// the sequence id is changed to the RefSeq-style accession NC_123456.
// STANDARD_SETUP and CLEAR_ERRORS are macros defined outside this section;
// presumably they declare scope/seh/validator/options/expected_errors/eval and
// free the expected-error list, respectively -- confirm against the header.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ExceptionProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);

    STANDARD_SETUP

    // A single expected error is reused for the non-RefSeq cases; only its
    // message and severity are edited between validations.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ExceptionProblem", "Exception explanation text is also found in feature comment"));
    //AddChromosomeNoLocation(expected_errors, entry);

    feat->SetExcept(true);

    // exception text is identical to the feature comment
    feat->SetExcept_text("RNA editing");
    feat->SetComment("RNA editing");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // two exception explanations, one of which is identical to the comment
    feat->SetExcept_text("RNA editing, rearrangement required for product");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // "reasons given in citation" exception on a feature with no citation
    feat->SetExcept_text("reasons given in citation");
    expected_errors[0]->SetErrMsg("Reasons given in citation exception does not have the required citation");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // "annotated by transcript or proteomic data" exception with no inference
    feat->SetExcept_text("annotated by transcript or proteomic data");
    expected_errors[0]->SetErrMsg("Annotated by transcript or proteomic data exception does not have the required inference qualifier");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // text that is not a legal exception explanation: expected as an error here
    feat->SetExcept_text("not a legal exception");
    expected_errors[0]->SetErrMsg("not a legal exception is not a legal exception explanation");
    expected_errors[0]->SetSeverity(eDiag_Error);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // change to ref-seq
    // NOTE(review): the sequence id is changed before the entry is removed
    // from the scope, and the feature location id is changed only after the
    // entry has been re-added; other tests in this file edit between the
    // remove and the add. Confirm that this order is intentional.
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
    scope.RemoveTopLevelSeqEntry(seh);
    seh = scope.AddTopLevelSeqEntry(*entry);
    feat->SetLocation().SetInt().SetId().SetOther().SetAccession("NC_123456");


    // multiple ref-seq exceptions
    feat->SetExcept_text("unclassified transcription discrepancy, RNA editing");
    feat->SetComment("misc_feature needs a comment");
    expected_errors[0]->SetErrMsg("Genome processing exception should not be combined with other explanations");
    expected_errors[0]->SetSeverity(eDiag_Warning);
    // presumably rewrites the accession stored in each expected error -- confirm
    ChangeErrorAcc(expected_errors, "ref|NC_123456|");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
    // not legal (is warning for NC or NT)
    feat->SetExcept_text("not a legal exception");
    expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "ExceptionProblem", "not a legal exception is not a legal exception explanation"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS

    // these are now legal for RefSeq
    // The feature is switched to an rRNA named "23S ribosomal RNA" and its
    // comment is removed; each of the four overlap texts below is validated
    // against an empty expected-error list.
    feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
    feat->SetData().SetRna().SetExt().SetName("23S ribosomal RNA");
    feat->ResetComment();
    feat->SetExcept_text("23S ribosomal RNA and 5S ribosomal RNA overlap");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    feat->SetExcept_text("5S ribosomal RNA and 16S ribosomal RNA overlap");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    feat->SetExcept_text("5S ribosomal RNA and 23S ribosomal RNA overlap");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    feat->SetExcept_text("23S ribosomal RNA and 16S ribosomal RNA overlap");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
15401 
15402 
// Exercises the "SeqDataLenWrong" diagnostic: the declared Seq-inst length is
// set against the amount of sequence data for IUPACna, NCBI2na and NCBI4na
// data, and then for delta sequences built from ranges on gb|AY123456.
// The scope/validator setup is written out by hand here, with the GenBank
// data loader registered before AddDefaults() (see the comment below);
// presumably that is why STANDARD_SETUP is not used -- confirm.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqDataLenWrong)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
    // need to call this statement before calling AddDefaults
    // to make sure that we can fetch the sequence referenced by the
    // delta sequence so that we can detect that the loc in the
    // delta sequence is longer than the referenced sequence
    CGBDataLoader::RegisterInObjectManager(*objmgr);
    CScope scope(*objmgr);
    scope.AddDefaults();

    CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);

    CValidator validator(*objmgr);

    // Set validator options
    unsigned int options = CValidator::eVal_need_isojta
                          | CValidator::eVal_far_fetch_mrna_products
                          | CValidator::eVal_validate_id_set | CValidator::eVal_indexer_version
                          | CValidator::eVal_use_entrez;

    // list of expected errors (raw pointers; released by CLEAR_ERRORS,
    // presumably via delete -- the macro is defined outside this section)
    vector< CExpectedError *> expected_errors;

    // validate - should be fine
    //AddChromosomeNoLocation(expected_errors, entry);
    CConstRef<CValidError> eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // longer and shorter for iupacna
    // (the data stays at 60 residues; only the declared length is changed)
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SeqDataLenWrong", "Bioseq.seq_data too short [60] for given length [65]"));
    entry->SetSeq().SetInst().SetLength(65);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    entry->SetSeq().SetInst().SetLength(55);
    expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [60] than given length [55]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // try other divisors
    // Declared length goes back to 60 while four more residues are appended,
    // so the data holds 64 residues when it is converted below.
    entry->SetSeq().SetInst().SetLength(60);
    entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('A');
    entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('T');
    entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('G');
    entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set().push_back('C');
    CRef<CSeq_data> packed_data(new CSeq_data);
    // convert seq data to another format
    // (NCBI2na = 2 bit nucleic acid code)
    CSeqportUtil::Convert(entry->SetSeq().SetInst().GetSeq_data(),
                          packed_data,
                          CSeq_data::e_Ncbi2na);
    entry->SetSeq().SetInst().SetSeq_data(*packed_data);
    expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Dropping the last two elements of the NCBI2na data takes the expected
    // residue count from 64 to 56.
    entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na().Set().pop_back();
    entry->SetSeq().SetInst().SetSeq_data().SetNcbi2na().Set().pop_back();
    expected_errors[0]->SetErrMsg("Bioseq.seq_data too short [56] for given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Same 56 residues re-encoded as NCBI4na; the expected message is unchanged.
    // NOTE(review): if SetSeq_data() above stores the passed object rather
    // than a copy, the input and output of this Convert call are the same
    // CSeq_data -- confirm that Convert supports that.
    CSeqportUtil::Convert(entry->SetSeq().SetInst().GetSeq_data(),
                          packed_data,
                          CSeq_data::e_Ncbi4na);
    entry->SetSeq().SetInst().SetSeq_data(*packed_data);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Appending four elements to the NCBI4na data takes the expected residue
    // count from 56 to 64.
    entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('1');
    entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('8');
    entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('1');
    entry->SetSeq().SetInst().SetSeq_data().SetNcbi4na().Set().push_back('8');
    expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Id used by the delta-sequence cases below (and by the disabled
    // seg/ref cases).
    CRef<CSeq_id> id(new CSeq_id("gb|AY123456"));
#if 0
    // removed per VR-779
    // now try seg and ref
    entry->SetSeq().SetInst().ResetSeq_data();
    entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_seg);
    CRef<CSeq_loc> loc(new CSeq_loc(*id, 0, 55));
    entry->SetSeq().SetInst().SetExt().SetSeg().Set().push_back(loc);
    expected_errors[0]->SetErrMsg("Bioseq.seq_data too short [56] for given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    loc->SetInt().SetTo(63);
    expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_ref);
    entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetId(*id);
    entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetFrom(0);
    entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetTo(55);
    expected_errors[0]->SetErrMsg("Bioseq.seq_data too short [56] for given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    entry->SetSeq().SetInst().SetExt().SetRef().SetInt().SetTo(63);
    expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
#endif

    CLEAR_ERRORS
    entry->SetSeq().SetInst().ResetSeq_data();
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "SeqDataLenWrong",
        "Bioseq.seq_data too short [56] for given length [60]"));
    //AddChromosomeNoLocation(expected_errors, entry);
    // delta sequence
    // one range 0..55 (56 positions) against a declared length of 60
    entry->SetSeq().SetInst().SetRepr(CSeq_inst::eRepr_delta);
    entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 0, 55);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    // two ranges, 0..30 and 40..72 (31 + 33 = 64 positions)
    entry->SetSeq().SetInst().SetExt().Reset();
    entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 0, 30);
    entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 40, 72);
    expected_errors[0]->SetErrMsg("Bioseq.seq_data is larger [64] than given length [60]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // a default-constructed (empty) CRef as the second delta segment
    entry->SetSeq().SetInst().SetExt().Reset();
    entry->SetSeq().SetInst().SetExt().SetDelta().AddSeqRange(*id, 0, 59);
    CRef<CDelta_seq> delta_seq;
    entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
    expected_errors[0]->SetErrMsg("NULL pointer in delta seq_ext valnode (segment 2)");
    expected_errors[0]->SetSeverity(eDiag_Error);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // a single delta location 0..485 with declared length 486; per the
    // expected message, the referenced sequence is 485 long (presumably this
    // needs gb|AY123456 to be fetched through the data loader -- confirm)
    entry->SetSeq().SetInst().SetExt().Reset();
    CRef<CDelta_seq> delta_seq2(new CDelta_seq());
    delta_seq2->SetLoc().SetInt().SetId(*id);
    delta_seq2->SetLoc().SetInt().SetFrom(0);
    delta_seq2->SetLoc().SetInt().SetTo(485);
    entry->SetSeq().SetInst().SetLength(486);
    entry->SetSeq().SetInst().SetExt().SetDelta().Set().push_back(delta_seq2);
    expected_errors[0]->SetErrMsg("Seq-loc extent (486) greater than length of gb|AY123456| (485)");
    expected_errors[0]->SetSeverity(eDiag_Critical);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
15554 
15555 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadConflictFlag)
{
    // Unmodified nuc-prot set, except that the coding region's conflict flag
    // is switched on.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    auto& cdregion = coding_region->SetData().SetCdregion();
    cdregion.SetConflict(true);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "BadConflictFlag", "Coding region conflict flag should not be set"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15572 
15573 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ConflictFlagSet)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Conflict flag on the coding region ...
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    coding_region->SetData().SetCdregion().SetConflict(true);

    // ... and the last member of the set given 9-residue protein data, with
    // its first feature stretched to position 8.
    CRef<CSeq_entry> protein = entry->SetSet().SetSeq_set().back();
    auto& protein_inst = protein->SetSeq().SetInst();
    protein_inst.SetSeq_data().SetIupacaa().Set("MPRKTEIXX");
    protein_inst.SetLength(9);
    CRef<CSeq_feat> protein_feat = protein->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
    protein_feat->SetLocation().SetInt().SetTo(8);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ConflictFlagSet", "Coding region conflict flag is set"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15595 
15596 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_LocusTagProblem)
{
    // Gene feature whose locus_tag contains spaces.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> gene_feat = unit_test_util::AddMiscFeature(entry);
    auto& gene_ref = gene_feat->SetData().SetGene();
    gene_ref.SetLocus_tag("a b c");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "LocusTagHasSpace", "Gene locus_tag 'a b c' should be a single word without any spaces"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // An old_locus_tag qualifier with the same value adds two more
    // expectations on top of the first one.
    gene_feat->AddQualifier("old_locus_tag", "a b c");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "RedundantFields", "old_locus_tag has same value as gene locus_tag"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "LocusTagProblem", "Gene locus_tag and old_locus_tag 'a b c' match"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Qualifiers removed; locus and locus_tag set to the same value.
    CLEAR_ERRORS
    gene_feat->ResetQual();
    gene_ref.SetLocus_tag("abc");
    gene_ref.SetLocus("abc");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "LocusTagGeneLocusMatch", "Gene locus and locus_tag 'abc' match"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Locus removed; old_locus_tag qualifier holding a comma-separated list.
    CLEAR_ERRORS
    gene_ref.ResetLocus();
    gene_feat->AddQualifier("old_locus_tag", "a, b, c");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OldLocusTagBadFormat", "old_locus_tag has comma, multiple old_locus_tags should be split into separate qualifiers"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15640 
15641 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_AltStartCodonException)
{
    // Accession given to the nucleotide sequence in the second half.
    const char* const kRefSeqAccession = "NM_123456";

    // Nuc-prot set whose coding region carries the
    // "alternative start codon" exception.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    coding_region->SetExcept(true);
    coding_region->SetExcept_text("alternative start codon");

    STANDARD_SETUP

    // With the original (non-RefSeq) id nothing is expected.
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Give the sequence, and the coding region's location, an "other"-type
    // accession; the exception is then expected to be reported.
    scope.RemoveTopLevelSeqEntry(seh);
    nuc->SetSeq().SetId().front()->SetOther().SetAccession(kRefSeqAccession);
    coding_region->SetLocation().SetInt().SetId().SetOther().SetAccession(kRefSeqAccession);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError("ref|NM_123456|", eDiag_Warning, "AltStartCodonException", "Unnecessary alternative start codon exception"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15672 
15673 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GenesInconsistent)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();

    // Gene "locus1" on the nucleotide sequence of the nuc-prot set (the mRNA).
    CRef<CSeq_entry> nuc_prot = unit_test_util::GetNucProtSetFromGenProdSet(entry);
    CRef<CSeq_entry> mrna = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(nuc_prot);
    CRef<CSeq_feat> mrna_gene = unit_test_util::AddMiscFeature(mrna);
    mrna_gene->SetLocation().SetInt().SetTo(26);
    mrna_gene->SetData().SetGene().SetLocus("locus1");

    // Gene "locus2" over the same interval end on the genomic sequence.
    CRef<CSeq_entry> genomic = unit_test_util::GetGenomicFromGenProdSet(entry);
    CRef<CSeq_feat> genomic_gene = unit_test_util::AddMiscFeature(genomic);
    genomic_gene->SetLocation().SetInt().SetTo(26);
    genomic_gene->SetData().SetGene().SetLocus("locus2");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "GenesInconsistent", "Gene on mRNA bioseq does not match gene on genomic bioseq"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15698 
15699 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateTranslExcept)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Builds a code-break located at lcl|nuc, positions 24 through 26.
    auto make_code_break = []() {
        CRef<CCode_break> code_break(new CCode_break());
        auto& interval = code_break->SetLoc().SetInt();
        interval.SetId().SetLocal().SetStr("nuc");
        interval.SetFrom(24);
        interval.SetTo(26);
        return code_break;
    };

    // Two separate code-breaks at the identical location.
    auto& code_breaks = coding_region->SetData().SetCdregion().SetCode_break();
    code_breaks.push_back(make_code_break());
    code_breaks.push_back(make_code_break());

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "DuplicateTranslExcept", "Multiple code-breaks at same location [lcl|nuc:25-27]"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15725 
15726 
// TranslExceptAndRnaEditing: the CDS gets both a code-break (24..26 on "nuc")
// and the exception text "RNA editing". Two warnings are expected, and
// DoesFeatureHaveUnnecessaryException() must report true for the CDS.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TranslExceptAndRnaEditing)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // The translation exception.
    CRef<CCode_break> code_break(new CCode_break());
    code_break->SetLoc().SetInt().SetId().SetLocal().SetStr("nuc");
    code_break->SetLoc().SetInt().SetFrom(24);
    code_break->SetLoc().SetInt().SetTo(26);
    coding_region->SetData().SetCdregion().SetCode_break().push_back(code_break);

    // The RNA editing exception on the same feature.
    coding_region->SetExcept(true);
    coding_region->SetExcept_text("RNA editing");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "TranslExceptAndRnaEditing",
                           "CDS has both RNA editing /exception and /transl_except qualifiers"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
                           "CDS has exception but passes translation test"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    BOOST_CHECK_EQUAL(validator::DoesFeatureHaveUnnecessaryException(*coding_region, scope), true);

    CLEAR_ERRORS
}
15753 
15754 
// NoNameForProtein: the protein feature's name is removed and the validator
// is run four times, with the Prot-ref holding in turn only a description,
// only an activity, only an EC number, and finally none of those fields.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NoNameForProtein)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> protein_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    auto& prot_ref = protein_feat->SetData().SetProt();

    // Stage 1: description but no name.
    prot_ref.ResetName();
    prot_ref.SetDesc("protein description");
    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "NoNameForProtein",
                           "Protein feature has description but no name"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Stage 2: activity but no name; only the message of the expectation changes.
    prot_ref.ResetDesc();
    prot_ref.SetActivity().push_back("activity");
    expected_errors[0]->SetErrMsg("Protein feature has function but no name");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Stage 3: EC number but no name.
    prot_ref.ResetActivity();
    prot_ref.SetEc().push_back("1.2.3.4");
    expected_errors[0]->SetErrMsg("Protein feature has EC number but no name");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Stage 4: EC number removed as well; two warnings are expected.
    CLEAR_ERRORS
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "ProtRefHasNoData",
                           "There is a protein feature where all fields are empty"));
    expected_errors.push_back(
        new CExpectedError("lcl|prot", eDiag_Warning, "NoNameForProtein",
                           "Protein feature has no name"));
    //AddChromosomeNoLocation(expected_errors, entry);

    prot_ref.ResetEc();
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15794 
// CDS/mRNA mismatch: one gene (extended to position 40) and two mRNAs are
// built from a single CDS; the second mRNA is also extended to position 40.
// Three warnings are expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSmRNAmismatch)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Gene derived from the CDS, ending at 40.
    CRef<CSeq_feat> parent_gene = unit_test_util::MakeGeneForFeature(coding_region);
    parent_gene->SetLocation().SetInt().SetTo(40);
    unit_test_util::AddFeat(parent_gene, nuc_seq);

    // First mRNA keeps the location produced by MakemRNAForCDS.
    CRef<CSeq_feat> first_mrna = unit_test_util::MakemRNAForCDS(coding_region);
    first_mrna->SetData().SetRna().SetExt().SetName("product 1");
    unit_test_util::AddFeat(first_mrna, nuc_seq);

    // Second mRNA is given a different name and ends at 40.
    CRef<CSeq_feat> second_mrna = unit_test_util::MakemRNAForCDS(coding_region);
    second_mrna->SetData().SetRna().SetExt().SetName("product 2");
    second_mrna->SetLocation().SetInt().SetTo(40);
    unit_test_util::AddFeat(second_mrna, nuc_seq);

    STANDARD_SETUP
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAmismatchCount",
                           "mRNA count (2) does not match CDS (1) count for gene"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithMultipleMRNAs",
                           "CDS matches 2 mRNAs"));
    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "CDSmRNAMismatchLocation",
                           "No CDS location match for 1 mRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
15826 
15827 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryException)15828 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryException)
15829 {
15830     CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
15831     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGenProdSet(entry);
15832     CRef<CSeq_feat> mrna = unit_test_util::GetmRNAFromGenProdSet (entry);
15833     cds->SetExcept(true);
15834     cds->SetExcept_text("RNA editing");
15835     mrna->SetExcept(true);
15836     mrna->SetExcept_text("transcribed product replaced");
15837 
15838     STANDARD_SETUP
15839     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryException",
15840                                "CDS has exception but passes translation test"));
15841     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "UnnecessaryException",
15842                                "mRNA has exception but passes transcription test"));
15843     //AddChromosomeNoLocation(expected_errors, entry);
15844 
15845     eval = validator.Validate(seh, options);
15846     CheckErrors (*eval, expected_errors);
15847     BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*cds, scope), true);
15848     BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*mrna, scope), true);
15849 
15850     CLEAR_ERRORS
15851 
15852     scope.RemoveTopLevelSeqEntry(seh);
15853     entry = unit_test_util::BuildGoodNucProtSet();
15854     CRef<CSeq_entry> nuc_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
15855     unit_test_util::SetSpliceForMixLoc (nuc_seq->SetSeq());
15856     cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
15857     cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_seq->SetSeq().SetId().front()));
15858     mrna = unit_test_util::MakemRNAForCDS (cds);
15859     unit_test_util::AddFeat (mrna, nuc_seq);
15860     CRef<CSeq_feat> exon = unit_test_util::AddMiscFeature(nuc_seq);
15861     exon->SetData().SetImp().SetKey("exon");
15862     exon->SetLocation().Assign(*(cds->SetLocation().SetMix().Set().front()));
15863     cds->SetExcept(true);
15864     cds->SetExcept_text("artificial frameshift");
15865     mrna->SetExcept(true);
15866     mrna->SetExcept_text("artificial frameshift");
15867     exon->SetExcept(true);
15868     exon->SetExcept_text("artificial frameshift");
15869     seh = scope.AddTopLevelSeqEntry(*entry);
15870 
15871     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
15872                                "feature has exception but passes splice site test"));
15873     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
15874                                "feature has exception but passes splice site test"));
15875     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryException",
15876                                "feature has exception but passes splice site test"));
15877     //AddChromosomeNoLocation(expected_errors, entry);
15878 
15879     options |= CValidator::eVal_val_exons;
15880     eval = validator.Validate(seh, options);
15881     CheckErrors (*eval, expected_errors);
15882 
15883     BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*cds, scope), true);
15884     BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*mrna, scope), true);
15885     BOOST_CHECK_EQUAL(DoesFeatureHaveUnnecessaryException(*exon, scope), true);
15886     CLEAR_ERRORS
15887 }
15888 
15889 
// LocusTagProductMismatch: the protein gets general id a/"good" (plus an
// extra local id "x"), the nucleotide gets the id from BuildRefSeqId(), and
// the gene gets locus_tag "something". With eVal_locus_tag_general_match
// enabled, one error is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_LocusTagProductMismatch)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_entry> prot_seq = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Re-identify the protein with a general id, then add a second, local id.
    CRef<CSeq_id> general_id(new CSeq_id());
    general_id->SetGeneral().SetDb("a");
    general_id->SetGeneral().SetTag().SetStr("good");
    unit_test_util::ChangeNucProtSetProteinId(entry, general_id);
    CRef<CSeq_id> extra_local_id(new CSeq_id());
    extra_local_id->SetLocal().SetStr("x");
    prot_seq->SetSeq().SetId().push_back(extra_local_id);

    // Re-identify the nucleotide.
    CRef<CSeq_id> refseq_id = unit_test_util::BuildRefSeqId();
    unit_test_util::ChangeNucProtSetNucId(entry, refseq_id);

    // Gene whose locus_tag differs from the general id tag.
    CRef<CSeq_feat> tagged_gene = unit_test_util::MakeGeneForFeature(coding_region);
    tagged_gene->SetData().SetGene().SetLocus_tag("something");
    unit_test_util::AddFeat(tagged_gene, nuc_seq);

    STANDARD_SETUP

    options |= CValidator::eVal_locus_tag_general_match;
    expected_errors.push_back(
        new CExpectedError("ref|NC_123456|", eDiag_Error, "LocusTagProductMismatch",
                           "Gene locus_tag does not match general ID of product"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
15922 
15923 
// PseudoCdsViaGeneHasProduct: a gene built from the CDS is flagged pseudo and
// added to the nucleotide sequence; one error is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoCdsViaGeneHasProduct)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    CRef<CSeq_feat> pseudo_gene = unit_test_util::MakeGeneForFeature(coding_region);
    pseudo_gene->SetPseudo(true);
    unit_test_util::AddFeat(pseudo_gene, nuc_seq);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "PseudoCdsViaGeneHasProduct",
                           "A coding region overlapped by a pseudogene should not have a product"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
15943 
15944 
// MissingGeneXref: a misc feature starting at 5 is given two genes built from
// it, "first" (start moved to 0) and "second" (end moved 5 past the feature's
// end); one warning is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingGeneXref)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    misc_feat->SetLocation().SetInt().SetFrom(5);

    // Gene "first": starts at 0.
    CRef<CSeq_feat> upstream_gene = unit_test_util::MakeGeneForFeature(misc_feat);
    upstream_gene->SetData().SetGene().SetLocus("first");
    upstream_gene->SetLocation().SetInt().SetFrom(0);
    unit_test_util::AddFeat(upstream_gene, entry);

    // Gene "second": ends 5 positions after the misc feature.
    CRef<CSeq_feat> downstream_gene = unit_test_util::MakeGeneForFeature(misc_feat);
    downstream_gene->SetData().SetGene().SetLocus("second");
    downstream_gene->SetLocation().SetInt().SetTo(misc_feat->GetLocation().GetInt().GetTo() + 5);
    unit_test_util::AddFeat(downstream_gene, entry);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "MissingGeneXref",
                           "Feature overlapped by 2 identical-length genes but has no cross-reference"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
15970 
15971 
// FeatureCitationProblem: a misc feature cites PMID 2, which this test never
// adds as a publication to the entry; one warning is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureCitationProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);

    CRef<CPub> cited_pub(new CPub());
    cited_pub->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
    misc_feat->SetCit().SetPub().push_back(cited_pub);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "FeatureCitationProblem",
                           "Citation on feature refers to uid [2] not on a publication in the record"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
15990 
15991 
// NestedSeqLocMix: the feature location is a mix holding one interval and an
// inner mix of two further intervals; the product is assigned a copy of that
// location. Four diagnostics are expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NestedSeqLocMix)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);

    // Interval 0..10.
    CRef<CSeq_loc> head_interval(new CSeq_loc());
    head_interval->SetInt().SetId().SetLocal().SetStr("good");
    head_interval->SetInt().SetFrom(0);
    head_interval->SetInt().SetTo(10);

    // Interval 20..30.
    CRef<CSeq_loc> middle_interval(new CSeq_loc());
    middle_interval->SetInt().SetId().SetLocal().SetStr("good");
    middle_interval->SetInt().SetFrom(20);
    middle_interval->SetInt().SetTo(30);

    // Interval 40..50.
    CRef<CSeq_loc> tail_interval(new CSeq_loc());
    tail_interval->SetInt().SetId().SetLocal().SetStr("good");
    tail_interval->SetInt().SetFrom(40);
    tail_interval->SetInt().SetTo(50);

    // Inner mix made of the last two intervals.
    CRef<CSeq_loc> inner_mix(new CSeq_loc());
    inner_mix->SetMix().Set().push_back(middle_interval);
    inner_mix->SetMix().Set().push_back(tail_interval);

    // Outer mix = first interval + inner mix; product copies the location.
    misc_feat->SetLocation().SetMix().Set().push_back(head_interval);
    misc_feat->SetLocation().SetMix().Set().push_back(inner_mix);
    misc_feat->SetProduct().Assign(misc_feat->SetLocation());

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "NestedSeqLocMix",
                           "Location: SeqLoc [[lcl|good:1-11, [21-31, 41-51]]] has nested SEQLOC_MIX elements"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "NestedSeqLocMix",
                           "Product: SeqLoc [[lcl|good:1-11, [21-31, 41-51]]] has nested SEQLOC_MIX elements"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "SelfReferentialProduct",
                           "Self-referential feature product"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Info, "ProductShouldBeWhole",
                           "Feature products should be entire sequences."));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
16031 
16032 
// CodonQualifierUsed: a "codon" qualifier with value "1" is added to the CDS;
// one error is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CodonQualifierUsed)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    coding_region->AddQualifier("codon", "1");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "CodonQualifierUsed",
                           "Use the proper genetic code, if available, or set transl_excepts on specific codons"));

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
16049 
16050 
// BadCharInAuthorName: an author whose first name is "F1rst" is appended to
// an article pub, which is attached to the sequence as a pub descriptor; one
// warning is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadCharInAuthorName)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeqdesc> pub_desc(new CSeqdesc());
    CRef<CPub> article = unit_test_util::BuildGoodArticlePub();

    // Author with a digit in the first name.
    CRef<CAuthor> odd_author = unit_test_util::BuildGoodAuthor();
    odd_author->SetName().SetName().SetFirst("F1rst");
    article->SetArticle().SetAuthors().SetNames().SetStd().push_back(odd_author);

    pub_desc->SetPub().SetPub().Set().push_back(article);
    entry->SetSeq().SetDescr().Set().push_back(pub_desc);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "BadCharInAuthorName",
                           "Bad characters in author F1rst"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
16072 
// PolyATail: the last feature in the first annotation of the first member of
// a gen-prod set (used here as the mRNA) is cut to end at 25. The validator
// is run once on that entry and once more after the transcript sequence is
// replaced by a 46-base sequence; only the PolyATail message differs between
// the two runs.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PolyATail)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
    CRef<CSeq_entry> contig_seq = entry->SetSet().SetSeq_set().front();
    CRef<CSeq_feat> mrna_feat = contig_seq->SetSeq().SetAnnot().front()->SetData().SetFtable().back();
    mrna_feat->SetLocation().SetInt().SetTo(25);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSwithNoMRNA",
                           "Unmatched CDS"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
                           "No CDS location match for 1 mRNA"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNArange",
                           "mRNA overlaps or contains CDS but does not completely contain intervals"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Info, "PolyATail",
                           "Transcript length [26] less than product length [27], but tail is 100% polyA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Second run: swap in a longer transcript sequence and re-register the entry.
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_entry> nuc_prot = unit_test_util::GetNucProtSetFromGenProdSet(entry);
    CRef<CSeq_entry> transcript_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(nuc_prot);
    transcript_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAAAAAAAAAAAAAAAAAATAA");
    transcript_seq->SetSeq().SetInst().SetLength(46);
    seh = scope.AddTopLevelSeqEntry(*entry);

    // Index 3 is the PolyATail expectation pushed above.
    expected_errors[3]->SetErrMsg("Transcript length [26] less than product length [46], but tail >= 95% polyA");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
16105 
// CDSwithMultipleMRNAs, in two parts:
//  1. A second mRNA built from the CDS is given product "nuc" and added to
//     the genomic sequence; five diagnostics are expected.
//  2. A new nuc-prot set ("nuc2"/"prot2") is appended to the gen-prod set and
//     the second mRNA's product is pointed at "nuc2"; four diagnostics are
//     expected, with CDSwithMultipleMRNAs at Info severity.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSwithMultipleMRNAs)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();
    CRef<CSeq_entry> genomic_seq = unit_test_util::GetGenomicFromGenProdSet(entry);
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGenProdSet(entry);

    CRef<CSeq_feat> extra_mrna = unit_test_util::MakemRNAForCDS(coding_region);
    extra_mrna->SetProduct().SetWhole().SetLocal().SetStr("nuc");
    unit_test_util::AddFeat(extra_mrna, genomic_seq);

    STANDARD_SETUP
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "FeatureProductInconsistency",
                           "mRNA products are not unique"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSwithMultipleMRNAs",
                           "CDS matches 2 mRNAs"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
                           "No CDS location match for 1 mRNA"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
                           "Duplicate feature"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Critical, "IdenticalMRNAtranscriptIDs",
                           "Identical transcript IDs found on multiple mRNAs"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Part 2: give the second mRNA its own product sequence.
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> second_nuc_id(new CSeq_id());
    second_nuc_id->SetLocal().SetStr("nuc2");
    CRef<CSeq_id> second_prot_id(new CSeq_id());
    second_prot_id->SetLocal().SetStr("prot2");
    CRef<CSeq_entry> second_nuc_prot = unit_test_util::BuildGenProdSetNucProtSet(second_nuc_id, second_prot_id);
    entry->SetSet().SetSeq_set().push_back(second_nuc_prot);
    extra_mrna->SetProduct().SetWhole().Assign(*second_nuc_id);
    seh = scope.AddTopLevelSeqEntry(*entry);

    CLEAR_ERRORS

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Info, "CDSwithMultipleMRNAs",
                           "CDS matches 2 mRNAs, but product locations are unique"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
                           "No CDS location match for 1 mRNA"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "FeatContentDup",
                           "Duplicate feature"));
    expected_errors.push_back(
        new CExpectedError("lcl|prot2", eDiag_Warning, "GenomicProductPackagingProblem",
                           "Protein bioseq should be product of CDS feature on contig, but is not"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
16156 
16157 
TestMultipleEquivBioSources(const string & lineage,TSeqPos first_end,TSeqPos second_start,bool expected)16158 void TestMultipleEquivBioSources(const string& lineage, TSeqPos first_end, TSeqPos second_start, bool expected)
16159 {
16160     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
16161     CRef<CSeq_feat> src1 = unit_test_util::AddMiscFeature (entry);
16162     src1->SetData().SetBiosrc().SetOrg().SetTaxname("Homo sapiens");
16163     src1->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage(lineage);
16164     src1->SetLocation().SetInt().SetTo(first_end);
16165     CRef<CSeq_feat> src2 = unit_test_util::AddMiscFeature (entry);
16166     src2->SetData().SetBiosrc().SetOrg().SetTaxname("Homo sapiens");
16167     src2->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage(lineage);
16168     src2->SetLocation().SetInt().SetFrom(second_start);
16169     src2->SetLocation().SetInt().SetTo(second_start + 9);
16170     unit_test_util::SetTransgenic(entry, true);
16171 
16172     STANDARD_SETUP
16173 
16174     if (expected) {
16175         expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "MultipleEquivBioSources",
16176             "Multiple equivalent source features should be combined into one multi-interval feature"));
16177     }
16178     //AddChromosomeNoLocation(expected_errors, entry);
16179 
16180     options |= CValidator::eVal_seqsubmit_parent;
16181     eval = validator.Validate(seh, options);
16182     CheckErrors (*eval, expected_errors);
16183 
16184     CLEAR_ERRORS
16185 }
16186 
16187 
// Drives TestMultipleEquivBioSources() over separated, abutting and
// overlapping source-feature pairs, for an ordinary lineage and for "Viruses".
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleEquivBioSources)
{
    struct SScenario {
        const char* lineage;
        TSeqPos     first_end;
        TSeqPos     second_start;
        bool        expected;
    };

    static const SScenario kScenarios[] = {
        { "some lineage", 10, 15, false },  // neither overlapping nor abutting
        { "some lineage", 10, 11, true  },  // abutting
        { "some lineage", 10,  8, true  },  // overlapping
        { "Viruses",      10, 15, false },  // never expected for viruses
        { "Viruses",      10, 11, false },
        { "Viruses",      10,  8, false }
    };

    for (const SScenario& scenario : kScenarios) {
        TestMultipleEquivBioSources(scenario.lineage, scenario.first_end,
                                    scenario.second_start, scenario.expected);
    }
}
16203 
16204 
// MultipleEquivPublications: two publication features citing the same PMID
// (2) are placed on one sequence, the second at 30..40; one warning is
// expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleEquivPublications)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // First publication feature, at the location AddMiscFeature() gives it.
    CRef<CSeq_feat> first_pub_feat = unit_test_util::AddMiscFeature(entry);
    CRef<CPub> first_pub(new CPub());
    first_pub->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
    first_pub_feat->SetData().SetPub().SetPub().Set().push_back(first_pub);

    // Second publication feature with the same PMID, moved to 30..40.
    CRef<CSeq_feat> second_pub_feat = unit_test_util::AddMiscFeature(entry);
    CRef<CPub> second_pub(new CPub());
    second_pub->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
    second_pub_feat->SetData().SetPub().SetPub().Set().push_back(second_pub);
    second_pub_feat->SetLocation().SetInt().SetFrom(30);
    second_pub_feat->SetLocation().SetInt().SetTo(40);

    STANDARD_SETUP
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "MultipleEquivPublications",
                           "Multiple equivalent publication features should be combined into one multi-interval feature"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16231 
16232 
// BadFullLengthFeature, in two parts (both with eVal_seqsubmit_parent set):
//  1. A source feature and a publication feature each end at length - 1;
//     only the publication warning is expected.
//  2. A second source feature ("Drosophila melanogaster") ending at
//     length - 1 is added; three warnings are expected.
// The "Source feature is full length" expectation remains disabled, as in the
// original test.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadFullLengthFeature)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Source feature whose end is set to the last position.
    CRef<CSeq_feat> human_src = unit_test_util::AddMiscFeature(entry);
    human_src->SetData().SetBiosrc().SetOrg().SetTaxname("Homo sapiens");
    human_src->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage("some lineage");
    human_src->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);

    // Publication feature whose end is set to the last position.
    CRef<CSeq_feat> pub_feat = unit_test_util::AddMiscFeature(entry);
    CRef<CPub> cited_pub(new CPub());
    cited_pub->SetPmid(CPub::TPmid(ENTREZ_ID_CONST(2)));
    pub_feat->SetData().SetPub().SetPub().Set().push_back(cited_pub);
    pub_feat->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);

    unit_test_util::SetTransgenic(entry, true);

    STANDARD_SETUP
    // expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
    //                            "Source feature is full length, should be descriptor"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
                           "Publication feature is full length, should be descriptor"));
    //AddChromosomeNoLocation(expected_errors, entry);
    options |= CValidator::eVal_seqsubmit_parent;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Part 2: add a second source feature ending at the last position.
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_feat> fly_src = unit_test_util::AddMiscFeature(entry);
    fly_src->SetData().SetBiosrc().SetOrg().SetTaxname("Drosophila melanogaster");
    fly_src->SetData().SetBiosrc().SetOrg().SetOrgname().SetLineage("some lineage");
    fly_src->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "DuplicateFeat",
                           "Features have identical intervals, but labels differ"));
    // expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
    //                            "Source feature is full length, should be descriptor"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
                           "Multiple full-length source features, should only be one if descriptor is transgenic"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "BadFullLengthFeature",
                           "Publication feature is full length, should be descriptor"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16280 
16281 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RedundantFields)16282 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RedundantFields)
16283 {
16284     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
16285     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
16286     CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature (cds);
16287     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
16288     unit_test_util::AddFeat (gene, nuc);
16289     gene->SetData().SetGene().SetLocus ("redundant_g");
16290     gene->SetComment ("redundant_g");
16291 
16292     STANDARD_SETUP
16293 
16294     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "RedundantFields",
16295                                "Comment has same value as gene locus"));
16296     //AddChromosomeNoLocation(expected_errors, entry);
16297     eval = validator.Validate(seh, options);
16298     CheckErrors (*eval, expected_errors);
16299 
16300     CLEAR_ERRORS
16301     gene->SetData().SetGene().ResetLocus();
16302     gene->SetData().SetGene().SetLocus_tag("redundant_g");
16303     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "RedundantFields",
16304                                "Comment has same value as gene locus_tag"));
16305     //AddChromosomeNoLocation(expected_errors, entry);
16306     eval = validator.Validate(seh, options);
16307     CheckErrors (*eval, expected_errors);
16308 
16309     CLEAR_ERRORS
16310 
16311     gene->ResetComment();
16312     gene->AddQualifier("old_locus_tag", "redundant_g");
16313     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "RedundantFields",
16314                                "old_locus_tag has same value as gene locus_tag"));
16315     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Error, "LocusTagProblem",
16316                                "Gene locus_tag and old_locus_tag 'redundant_g' match"));
16317     //AddChromosomeNoLocation(expected_errors, entry);
16318 
16319     eval = validator.Validate(seh, options);
16320     CheckErrors (*eval, expected_errors);
16321 
16322     CLEAR_ERRORS
16323 
16324     gene->ResetQual();
16325 
16326     CRef<CSeq_feat> prot = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
16327     prot->SetData().SetProt().SetName().front().assign("redundant_p");
16328     prot->SetComment("redundant_p");
16329     prot->SetData().SetProt().SetDesc("redundant_p");
16330 
16331     expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "RedundantFields",
16332                                "Comment has same value as protein name"));
16333     expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "RedundantFields",
16334                                "Comment has same value as protein description"));
16335     //AddChromosomeNoLocation(expected_errors, entry);
16336     eval = validator.Validate(seh, options);
16337     CheckErrors (*eval, expected_errors);
16338 
16339     CLEAR_ERRORS
16340 }
16341 
16342 
AddCDSAndProtForBigGoodNucProtSet(CRef<CSeq_entry> entry,string nuc_id,string prot_id,TSeqPos offset)16343 static void AddCDSAndProtForBigGoodNucProtSet (CRef<CSeq_entry> entry, string nuc_id, string prot_id, TSeqPos offset)
16344 {
16345     CRef<CSeq_feat> cds (new CSeq_feat());
16346     cds->SetData().SetCdregion();
16347     cds->SetProduct().SetWhole().SetLocal().SetStr(prot_id);
16348     cds->SetLocation().SetInt().SetId().SetLocal().SetStr(nuc_id);
16349     cds->SetLocation().SetInt().SetFrom(offset + 0);
16350     cds->SetLocation().SetInt().SetTo(offset + 26);
16351     unit_test_util::AddFeat (cds, entry);
16352 
16353     CRef<CSeq_entry> pentry = unit_test_util::MakeProteinForGoodNucProtSet(prot_id);
16354 
16355     entry->SetSet().SetSeq_set().push_back(pentry);
16356 
16357 }
16358 
16359 
BuildBigGoodNucProtSet(void)16360 static CRef<CSeq_entry> BuildBigGoodNucProtSet(void)
16361 {
16362     CRef<CBioseq_set> set(new CBioseq_set());
16363     set->SetClass(CBioseq_set::eClass_nuc_prot);
16364 
16365     // make nucleotide
16366     CRef<CBioseq> nseq(new CBioseq());
16367     nseq->SetInst().SetMol(CSeq_inst::eMol_dna);
16368     nseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
16369     nseq->SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
16370     nseq->SetInst().SetLength(360);
16371 
16372     CRef<CSeq_id> id(new CSeq_id());
16373     id->SetLocal().SetStr ("nuc");
16374     nseq->SetId().push_back(id);
16375 
16376     CRef<CSeqdesc> mdesc(new CSeqdesc());
16377     mdesc->SetMolinfo().SetBiomol(CMolInfo::eBiomol_genomic);
16378     nseq->SetDescr().Set().push_back(mdesc);
16379 
16380     CRef<CSeq_entry> nentry(new CSeq_entry());
16381     nentry->SetSeq(*nseq);
16382 
16383     set->SetSeq_set().push_back(nentry);
16384 
16385     CRef<CSeq_entry> set_entry(new CSeq_entry());
16386     set_entry->SetSet(*set);
16387 
16388     int i = 1;
16389     for (TSeqPos offset = 0; offset < nseq->GetInst().GetLength() - 26; offset += 30, i++) {
16390         string prot_id = "prot" + NStr::IntToString(i);
16391         AddCDSAndProtForBigGoodNucProtSet (set_entry, "nuc", prot_id, offset);
16392     }
16393 
16394     unit_test_util::AddGoodSource (set_entry);
16395     unit_test_util::AddGoodPub(set_entry);
16396     return set_entry;
16397 }
16398 
16399 
// CDSwithNoMRNA: with an mRNA on only one CDS the validator is expected to
// emit a single summary warning; after three more CDSs receive mRNAs it is
// expected to emit one "Unmatched CDS" warning per remaining CDS instead.
BOOST_AUTO_TEST_CASE (Test_SEQ_FEAT_CDSwithNoMRNA)
{
    CRef<CSeq_entry> entry = BuildBigGoodNucProtSet();

    // Lookup retained from the original test; the handle is not used below.
    CRef<CSeq_feat> first_cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Walk the set-level feature table, starting at its first feature.
    CSeq_annot::TData::TFtable::iterator cds_iter =
        entry->SetSet().SetAnnot().front()->SetData().SetFtable().begin();

    // mRNA and gene for the first CDS, both placed on the nucleotide
    CRef<CSeq_feat> first_mrna = unit_test_util::MakemRNAForCDS(*cds_iter);
    CRef<CSeq_entry> nuc_entry = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    unit_test_util::AddFeat(first_mrna, nuc_entry);
    CRef<CSeq_feat> first_gene = unit_test_util::MakeGeneForFeature(*cds_iter);
    unit_test_util::AddFeat(first_gene, nuc_entry);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
                           "11 out of 12 CDSs unmatched"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS

    // Give the next three CDSs an mRNA each; the entry is detached from the
    // scope while it is edited and re-added afterwards.
    scope.RemoveTopLevelSeqEntry(seh);
    for (int added = 0; added < 3; ++added) {
        ++cds_iter;
        unit_test_util::AddFeat(unit_test_util::MakemRNAForCDS(*cds_iter), nuc_entry);
    }
    seh = scope.AddTopLevelSeqEntry(*entry);

    // 12 CDSs - 4 with mRNA = 8 individual reports
    for (int remaining = 8; remaining > 0; --remaining) {
        expected_errors.push_back(
            new CExpectedError("lcl|nuc", eDiag_Warning, "CDSwithNoMRNA",
                               "Unmatched CDS"));
    }

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16441 
16442 
BOOST_FIXTURE_TEST_CASE(Test_SEQ_FEAT_FeatureProductInconsistency,CGenBankFixture)16443 BOOST_FIXTURE_TEST_CASE(Test_SEQ_FEAT_FeatureProductInconsistency, CGenBankFixture)
16444 {
16445     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
16446     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
16447     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
16448     CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
16449     unit_test_util::AddFeat (mrna, nuc);
16450     CRef<CSeq_feat> bad_cds = unit_test_util::AddMiscFeature(nuc);
16451     bad_cds->SetData().SetCdregion();
16452     bad_cds->SetLocation().SetInt().SetFrom(30);
16453     bad_cds->SetLocation().SetInt().SetTo(56);
16454     CRef<CSeq_feat> bad_mrna = unit_test_util::MakemRNAForCDS(bad_cds);
16455     unit_test_util::AddFeat (bad_mrna, nuc);
16456 
16457     STANDARD_SETUP
16458 
16459     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16460                                 "2 CDS features have 1 product references"));
16461     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Error, "MissingCDSproduct", "Expected CDS product absent"));
16462     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Error, "NoProtein", "No protein Bioseq given"));
16463     //AddChromosomeNoLocation(expected_errors, entry);
16464     eval = validator.Validate(seh, options);
16465     CheckErrors (*eval, expected_errors);
16466 
16467     CLEAR_ERRORS
16468 
16469     scope.RemoveTopLevelSeqEntry(seh);
16470     bad_cds->SetProduct().SetWhole().SetLocal().SetStr("prot");
16471     seh = scope.AddTopLevelSeqEntry(*entry);
16472 
16473     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16474                                 "CDS products are not unique"));
16475     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "MultipleCDSproducts",
16476                                 "Same product Bioseq from multiple CDS features"));
16477     //AddChromosomeNoLocation(expected_errors, entry);
16478     eval = validator.Validate(seh, options);
16479     CheckErrors (*eval, expected_errors);
16480 
16481     CLEAR_ERRORS
16482 
16483     scope.RemoveTopLevelSeqEntry(seh);
16484     nuc->SetSeq().ResetAnnot();
16485     AddCDSAndProtForBigGoodNucProtSet (entry, "nuc", "prot1", 30);
16486     bad_mrna = unit_test_util::MakemRNAForCDS(entry->SetSet().SetAnnot().front()->SetData().SetFtable().back());
16487     unit_test_util::AddFeat (bad_mrna, nuc);
16488     mrna = unit_test_util::MakemRNAForCDS (cds);
16489     mrna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");
16490     unit_test_util::AddFeat (mrna, nuc);
16491 
16492     seh = scope.AddTopLevelSeqEntry(*entry);
16493     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16494                                 "2 mRNA features have 1 product references"));
16495     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptLen",
16496         "Transcript length [27] less than (far) product length [485], and tail < 95% polyA"));
16497     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptMismatches",
16498         "There are 16 mismatches out of 27 bases between the transcript and (far) product sequence"));
16499     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
16500         "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
16501     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RnaProductMismatch",
16502         "Type of RNA does not match MolInfo of product Bioseq"));
16503     //AddChromosomeNoLocation(expected_errors, entry);
16504 
16505     eval = validator.Validate(seh, options);
16506     CheckErrors (*eval, expected_errors);
16507 
16508     CLEAR_ERRORS
16509 
16510     scope.RemoveTopLevelSeqEntry(seh);
16511     bad_mrna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");
16512     seh = scope.AddTopLevelSeqEntry(*entry);
16513     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "FeatureProductInconsistency",
16514                                 "mRNA products are not unique"));
16515     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
16516         "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
16517     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Critical, "IdenticalMRNAtranscriptIDs",
16518                     "Identical transcript IDs found on multiple mRNAs"));
16519     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptLen",
16520         "Transcript length [27] less than (far) product length [485], and tail < 95% polyA"));
16521     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptMismatches",
16522         "There are 16 mismatches out of 27 bases between the transcript and (far) product sequence"));
16523     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RnaProductMismatch",
16524         "Type of RNA does not match MolInfo of product Bioseq"));
16525     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptLen",
16526         "Transcript length [27] less than (far) product length [485], and tail < 95% polyA"));
16527     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "TranscriptMismatches",
16528         "There are 16 mismatches out of 27 bases between the transcript and (far) product sequence"));
16529     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
16530         "Inconsistent: Product= partial, Location= complete, Feature.partial= FALSE"));
16531     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RnaProductMismatch",
16532         "Type of RNA does not match MolInfo of product Bioseq"));
16533     //AddChromosomeNoLocation(expected_errors, entry);
16534 
16535     eval = validator.Validate(seh, options);
16536     CheckErrors (*eval, expected_errors);
16537 
16538     CLEAR_ERRORS
16539 }
16540 
16541 
SetFeatureLocationBond(CRef<CSeq_feat> feat,string id,TSeqPos pt1,TSeqPos pt2)16542 static void SetFeatureLocationBond (CRef<CSeq_feat> feat, string id, TSeqPos pt1, TSeqPos pt2)
16543 {
16544     feat->SetLocation().SetBond().SetA().SetId().SetLocal().SetStr(id);
16545     feat->SetLocation().SetBond().SetA().SetPoint(0);
16546     feat->SetLocation().SetBond().SetB().SetId().SetLocal().SetStr(id);
16547     feat->SetLocation().SetBond().SetB().SetPoint(5);
16548 }
16549 
16550 
// ImproperBondLocation: four features get a bond location; only the one whose
// data is actually a bond is accepted, so three warnings are expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ImproperBondLocation)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // plain misc_feature
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    SetFeatureLocationBond(misc_feat, "good", 0, 5);

    // heterogen feature
    CRef<CSeq_feat> het_feat = unit_test_util::AddMiscFeature(entry);
    het_feat->SetData().SetHet();
    SetFeatureLocationBond(het_feat, "good", 0, 5);

    // pseudo coding region
    CRef<CSeq_feat> pseudo_cds = unit_test_util::AddMiscFeature(entry);
    pseudo_cds->SetData().SetCdregion();
    pseudo_cds->SetPseudo(true);
    SetFeatureLocationBond(pseudo_cds, "good", 0, 5);

    // genuine bond feature
    CRef<CSeq_feat> bond_feat = unit_test_util::AddMiscFeature(entry);
    bond_feat->SetData().SetBond();
    SetFeatureLocationBond(bond_feat, "good", 0, 5);

    STANDARD_SETUP

    // one identical warning per non-bond feature
    for (int n = 0; n < 3; ++n) {
        expected_errors.push_back(
            new CExpectedError("lcl|good", eDiag_Warning, "ImproperBondLocation",
                               "Bond location should only be on bond features"));
    }
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16583 
16584 
// GeneXrefWithoutGene: a feature carrying a gene xref (first by locus, then
// by locus_tag) on a record with no gene feature should produce both a
// GeneXrefWithoutGene and an OnlyGeneXrefs warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefWithoutGene)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> xref_holder = unit_test_util::AddMiscFeature(entry);

    // --- xref by locus ---
    xref_holder->SetGeneXref().SetLocus("missing");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutGene",
                           "Feature has gene locus cross-reference but no equivalent gene feature exists"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "OnlyGeneXrefs",
                           "There are 1 gene xrefs and no gene features in this record."));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS

    // --- xref by locus_tag ---
    xref_holder->SetGeneXref().ResetLocus();
    xref_holder->SetGeneXref().SetLocus_tag("missing");

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutGene",
                           "Feature has gene locus_tag cross-reference but no equivalent gene feature exists"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "OnlyGeneXrefs",
                           "There are 1 gene xrefs and no gene features in this record."));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
16614 
16615 
CreateReciprocalLinks(CSeq_feat & f1,CSeq_feat & f2)16616 void CreateReciprocalLinks(CSeq_feat& f1, CSeq_feat& f2)
16617 {
16618     f1.AddSeqFeatXref(f2.GetId());
16619     f2.AddSeqFeatXref(f1.GetId());
16620 }
16621 
16622 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqFeatXrefProblem)16623 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqFeatXrefProblem)
16624 {
16625     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
16626     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
16627     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
16628 
16629     // add ID to CDS
16630     cds->SetId().SetLocal().SetId(1);
16631 
16632     // create mRNA feature
16633     CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
16634     mrna->SetId().SetLocal().SetId(2);
16635     unit_test_util::AddFeat (mrna, nuc);
16636 
16637     // create gene feature
16638     CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature(cds);
16639     gene->SetId().SetLocal().SetId(3);
16640     unit_test_util::AddFeat (gene, nuc);
16641 
16642     // add misc_feature
16643     CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(nuc);
16644     misc->SetId().SetLocal().SetId(4);
16645 
16646     STANDARD_SETUP
16647 
16648     // add broken SeqFeatXref to coding region
16649     CRef<CSeqFeatXref> x1(new CSeqFeatXref());
16650     cds->SetXref().push_back(x1);
16651 
16652     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16653                                 "SeqFeatXref with no id or data field"));
16654     //AddChromosomeNoLocation(expected_errors, entry);
16655     eval = validator.Validate(seh, options);
16656     CheckErrors (*eval, expected_errors);
16657     cds->ResetXref();
16658 
16659     CLEAR_ERRORS
16660 
16661     // xref between CDS and misc_feat is not allowed,
16662     // triggers error for non-ambiguous CDS/mRNA
16663     scope.RemoveTopLevelSeqEntry(seh);
16664     CreateReciprocalLinks(*cds, *misc);
16665     seh = scope.AddTopLevelSeqEntry(*entry);
16666 
16667 //    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefNotReciprocal",
16668 //                                "CDS/mRNA unambiguous pair have erroneous cross-references"));
16669     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16670                                 "Cross-references are not between CDS and mRNA pair or between a gene and a CDS or mRNA (misc_feature,CDS)"));
16671     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16672                                 "Cross-references are not between CDS and mRNA pair or between a gene and a CDS or mRNA (CDS,misc_feature)"));
16673     //AddChromosomeNoLocation(expected_errors, entry);
16674     eval = validator.Validate(seh, options);
16675     CheckErrors (*eval, expected_errors);
16676 
16677     CLEAR_ERRORS
16678 
16679     // complain if linked-to feature has no xrefs of its own
16680     scope.RemoveTopLevelSeqEntry(seh);
16681     misc->ResetXref();
16682     seh = scope.AddTopLevelSeqEntry(*entry);
16683 //    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefNotReciprocal",
16684 //                                "CDS/mRNA unambiguous pair have erroneous cross-references"));
16685     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16686                                 "Cross-referenced feature does not have its own cross-reference"));
16687     //AddChromosomeNoLocation(expected_errors, entry);
16688     eval = validator.Validate(seh, options);
16689     CheckErrors (*eval, expected_errors);
16690 
16691     cds->ResetXref();
16692 
16693     CLEAR_ERRORS
16694 
16695     // create xref between mRNA and coding region - this is allowed
16696     scope.RemoveTopLevelSeqEntry(seh);
16697     CreateReciprocalLinks(*cds, *mrna);
16698     seh = scope.AddTopLevelSeqEntry(*entry);
16699 
16700     //AddChromosomeNoLocation(expected_errors, entry);
16701     eval = validator.Validate(seh, options);
16702     CheckErrors (*eval, expected_errors);
16703 
16704     // create xref between coding region and gene - this is allowed
16705     scope.RemoveTopLevelSeqEntry(seh);
16706     CreateReciprocalLinks(*cds, *gene);
16707     seh = scope.AddTopLevelSeqEntry(*entry);
16708 
16709     eval = validator.Validate(seh, options);
16710     CheckErrors (*eval, expected_errors);
16711 
16712     // create xref between mRNA and gene - this is allowed
16713     scope.RemoveTopLevelSeqEntry(seh);
16714     CreateReciprocalLinks(*mrna, *gene);
16715     seh = scope.AddTopLevelSeqEntry(*entry);
16716 
16717     eval = validator.Validate(seh, options);
16718     CheckErrors (*eval, expected_errors);
16719 
16720     // shouldn't matter what order the links are created in
16721     scope.RemoveTopLevelSeqEntry(seh);
16722     mrna->ResetXref();
16723     cds->ResetXref();
16724     gene->ResetXref();
16725     CreateReciprocalLinks(*cds, *gene);
16726     CreateReciprocalLinks(*mrna, *gene);
16727     CreateReciprocalLinks(*cds, *mrna);
16728     seh = scope.AddTopLevelSeqEntry(*entry);
16729 
16730     eval = validator.Validate(seh, options);
16731     CheckErrors (*eval, expected_errors);
16732 
16733     // if feature has gene xref AND a feature ID xref to a gene feature,
16734     // they should not conflict
16735     scope.RemoveTopLevelSeqEntry(seh);
16736     CRef<CSeq_feat> other_gene = unit_test_util::AddMiscFeature(nuc);
16737     other_gene->SetData().SetGene().SetLocus("mismatch");
16738     // note that gene and other_gene cannot have the same location or will
16739     // trigger duplicate feature errors, gene xref gene should not be the
16740     // gene mapped to by overlap
16741     other_gene->SetLocation().Assign(gene->GetLocation());
16742     other_gene->SetLocation().SetInt().SetTo(other_gene->GetLocation().GetInt().GetTo() + 1);
16743     seh = scope.AddTopLevelSeqEntry(*entry);
16744 
16745     CRef<CSeqFeatXref> gene_xref(new CSeqFeatXref());
16746     gene_xref->SetData().SetGene().SetLocus("mismatch");
16747     cds->SetXref().push_back(gene_xref);
16748 
16749     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SeqFeatXrefProblem",
16750                                 "Feature gene xref does not match Feature ID cross-referenced gene feature"));
16751     eval = validator.Validate(seh, options);
16752     CheckErrors (*eval, expected_errors);
16753 
16754     CLEAR_ERRORS
16755 
16756     // ignore if gene xref and linked gene feature match
16757     scope.RemoveTopLevelSeqEntry(seh);
16758     gene_xref->SetData().SetGene().SetLocus("gene locus");
16759     other_gene->SetLocation().Assign(gene->GetLocation());
16760     gene->SetLocation().SetInt().SetTo(gene->GetLocation().GetInt().GetTo() + 1);
16761     seh = scope.AddTopLevelSeqEntry(*entry);
16762 
16763     //AddChromosomeNoLocation(expected_errors, entry);
16764     eval = validator.Validate(seh, options);
16765     CheckErrors (*eval, expected_errors);
16766 
16767     CLEAR_ERRORS
16768 }
16769 
16770 
// MissingTrnaAA: an RNA feature typed as tRNA, with nothing else set on the
// RNA data, should draw a warning about the missing amino acid.
BOOST_AUTO_TEST_CASE (Test_SEQ_FEAT_MissingTrnaAA)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna_feat = unit_test_util::AddMiscFeature(entry);
    trna_feat->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "MissingTrnaAA",
                           "Missing encoded amino acid qualifier in tRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16787 
16788 
// CollidingFeatureIDs: two features sharing local feature id 1 should each be
// reported with a critical error.
BOOST_AUTO_TEST_CASE (Test_SEQ_FEAT_CollidingFeatureIDs)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    misc_feat->SetId().SetLocal().SetId(1);

    // gene built from the misc feature, deliberately given the same id
    CRef<CSeq_feat> gene_feat = unit_test_util::MakeGeneForFeature(misc_feat);
    gene_feat->SetId().SetLocal().SetId(1);
    unit_test_util::AddFeat(gene_feat, entry);

    STANDARD_SETUP

    // one report per colliding feature
    for (int n = 0; n < 2; ++n) {
        expected_errors.push_back(
            new CExpectedError("lcl|good", eDiag_Critical, "CollidingFeatureIDs",
                               "Colliding feature ID 1"));
    }
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16809 
16810 
// PolyAsignalNotRange: a polyA_signal import feature whose interval end is
// set to 0 should be flagged with a warning that it should be a range.
BOOST_AUTO_TEST_CASE (Test_SEQ_FEAT_PolyAsignalNotRange)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> polya_feat = unit_test_util::AddMiscFeature(entry);
    polya_feat->SetLocation().SetInt().SetTo(0);
    polya_feat->SetData().SetImp().SetKey("polyA_signal");

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "PolyAsignalNotRange",
                           "PolyA_signal should be a range"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16827 
16828 
// OldLocusTagMismtach: a feature and the gene built from it carry different
// old_locus_tag qualifier values; a mismatch warning plus an
// OldLocusTagWithoutLocusTag error are expected.
BOOST_AUTO_TEST_CASE (Test_SEQ_FEAT_OldLocusTagMismtach)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // feature side
    CRef<CSeq_feat> misc_feat = unit_test_util::AddMiscFeature(entry);
    misc_feat->AddQualifier("old_locus_tag", "one value");

    // gene side, with a conflicting value
    CRef<CSeq_feat> gene_feat = unit_test_util::MakeGeneForFeature(misc_feat);
    gene_feat->AddQualifier("old_locus_tag", "another value");
    unit_test_util::AddFeat(gene_feat, entry);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "OldLocusTagMismtach",
                           "Old locus tag on feature (one value) does not match that on gene (another value)"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "OldLocusTagWithoutLocusTag",
                           "old_locus_tag without inherited locus_tag"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
16852 
16853 
MakeGoTerm(string text="something",string evidence="some evidence")16854 static CRef<CUser_field> MakeGoTerm (string text = "something", string evidence = "some evidence")
16855 {
16856     CRef<CUser_field> go_term (new CUser_field());
16857     go_term->SetLabel().SetStr("a go term");
16858 
16859     SetGoTermId(*go_term, "123");
16860 
16861     SetGoTermPMID(*go_term, 4);
16862 
16863     SetGoTermText(*go_term, text);
16864 
16865     AddGoTermEvidence(*go_term, evidence);
16866 
16867     return go_term;
16868 }
16869 
16870 
CheckGeneOntologyTermDuplicate(CRef<CSeq_feat> feat)16871 void CheckGeneOntologyTermDuplicate(CRef<CSeq_feat> feat)
16872 {
16873     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
16874     AddFeat(feat, entry);
16875 
16876     STANDARD_SETUP
16877 
16878     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Info, "DuplicateGeneOntologyTerm",
16879                                 "Duplicate GO term on feature"));
16880     //AddChromosomeNoLocation(expected_errors, entry);
16881     eval = validator.Validate(seh, options);
16882     CheckErrors (*eval, expected_errors);
16883 
16884     CLEAR_ERRORS
16885 
16886     BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)2);
16887     RemoveDuplicateGoTerms(*feat);
16888     BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)1);
16889     RemoveDuplicateGoTerms(*feat);
16890     BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)1);
16891 
16892 }
16893 
16894 
CheckGeneOntologyTermNotDuplicate(CRef<CSeq_feat> feat)16895 void CheckGeneOntologyTermNotDuplicate(CRef<CSeq_feat> feat)
16896 {
16897     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
16898     AddFeat(feat, entry);
16899 
16900     STANDARD_SETUP
16901 
16902     eval = validator.Validate(seh, options);
16903     //AddChromosomeNoLocation(expected_errors, entry);
16904     CheckErrors (*eval, expected_errors);
16905 
16906     CLEAR_ERRORS
16907 
16908     BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)2);
16909     RemoveDuplicateGoTerms(*feat);
16910     BOOST_CHECK_EQUAL(CountProcessGoTerms(*feat), (size_t)2);
16911 }
16912 
16913 
MakeGeneOntologyFeat(CRef<CUser_field> term1,CRef<CUser_field> term2)16914 CRef<CSeq_feat> MakeGeneOntologyFeat(CRef<CUser_field> term1, CRef<CUser_field> term2)
16915 {
16916     CRef<CSeq_feat> feat(new CSeq_feat());
16917     feat->SetLocation().SetInt().SetFrom(0);
16918     feat->SetLocation().SetInt().SetTo(10);
16919     feat->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
16920     feat->SetData().SetImp().SetKey("misc_feature");
16921     feat->SetComment("comment is required");
16922 
16923     AddProcessGoTerm(*feat, term1);
16924     AddProcessGoTerm(*feat, term2);
16925 
16926     return feat;
16927 }
16928 
16929 
// DuplicateGeneOntologyTerm: steps two GO terms through a series of
// id / evidence / PMID combinations and checks, for each, whether the pair is
// reported (and cleaned up) as a duplicate.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateGeneOntologyTerm)
{
    CRef<CUser_field> first_term = MakeGoTerm();
    CRef<CUser_field> second_term = MakeGoTerm();
    CRef<CSeq_feat> go_feat = MakeGeneOntologyFeat(first_term, second_term);

    // two default terms: duplicate
    CheckGeneOntologyTermDuplicate(go_feat);

    // second term gets a different id: not duplicate
    SetGoTermId(*second_term, "234");
    go_feat = MakeGeneOntologyFeat(first_term, second_term);
    CheckGeneOntologyTermNotDuplicate(go_feat);

    // fresh second term, evidence cleared on the first only: not duplicate
    second_term = MakeGoTerm();
    ClearGoTermEvidence(*first_term);
    go_feat = MakeGeneOntologyFeat(first_term, second_term);
    CheckGeneOntologyTermNotDuplicate(go_feat);

    // evidence cleared on both: duplicate
    ClearGoTermEvidence(*second_term);
    go_feat = MakeGeneOntologyFeat(first_term, second_term);
    CheckGeneOntologyTermDuplicate(go_feat);

    // evidence {A, B} versus {C, B}: not duplicate
    AddGoTermEvidence(*first_term, "A");
    AddGoTermEvidence(*first_term, "B");

    AddGoTermEvidence(*second_term, "C");
    AddGoTermEvidence(*second_term, "B");
    go_feat = MakeGeneOntologyFeat(first_term, second_term);
    CheckGeneOntologyTermNotDuplicate(go_feat);

    // evidence cleared on both, PMID cleared on the second only: not duplicate
    ClearGoTermEvidence(*first_term);
    ClearGoTermEvidence(*second_term);
    ClearGoTermPMID(*second_term);
    go_feat = MakeGeneOntologyFeat(first_term, second_term);
    CheckGeneOntologyTermNotDuplicate(go_feat);

    // PMID cleared on both: duplicate
    ClearGoTermPMID(*first_term);
    go_feat = MakeGeneOntologyFeat(first_term, second_term);
    CheckGeneOntologyTermDuplicate(go_feat);
}
16971 
16972 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidInferenceValue)16973 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidInferenceValue)
16974 {
16975     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
16976     CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
16977     feat->AddQualifier("inference", " ");
16978 
16979     STANDARD_SETUP
16980 
16981     feat->SetQual().front()->SetVal("bad");
16982     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "InvalidInferenceValue",
16983                                  "Inference qualifier problem - bad inference prefix (bad)"));
16984     //AddChromosomeNoLocation(expected_errors, entry);
16985     eval = validator.Validate(seh, options);
16986     CheckErrors (*eval, expected_errors);
16987 
16988     feat->SetQual().front()->SetVal("similar to sequence");
16989     expected_errors[0]->SetErrMsg("Inference qualifier problem - bad inference body (similar to sequence)");
16990     eval = validator.Validate(seh, options);
16991     CheckErrors (*eval, expected_errors);
16992 
16993     feat->SetQual().front()->SetVal("profile(same species): INSD:AY123456.1");
16994     expected_errors[0]->SetErrMsg("Inference qualifier problem - same species misused (profile(same species): INSD:AY123456.1)");
16995     eval = validator.Validate(seh, options);
16996     CheckErrors (*eval, expected_errors);
16997 
16998     feat->SetQual().front()->SetVal("similar to RNA sequence: INSD:AY123456.1 INSD:AY123457");
16999     expected_errors[0]->SetErrMsg("Inference qualifier problem - spaces in inference (similar to RNA sequence: INSD:AY123456.1 INSD:AY123457)");
17000     eval = validator.Validate(seh, options);
17001     CheckErrors (*eval, expected_errors);
17002 
17003     feat->SetQual().front()->SetVal("similar to RNA sequence: INSD:AY123456");
17004     expected_errors[0]->SetErrMsg("Inference qualifier problem - bad inference accession version (similar to RNA sequence: INSD:AY123456)");
17005     eval = validator.Validate(seh, options);
17006     CheckErrors (*eval, expected_errors);
17007 
17008     feat->SetQual().front()->SetVal("similar to RNA sequence: RefSeq:AY123456.1");
17009     expected_errors[0]->SetErrMsg("Inference qualifier problem - bad accession type (similar to RNA sequence: RefSeq:AY123456.1)");
17010     eval = validator.Validate(seh, options);
17011     CheckErrors (*eval, expected_errors);
17012 
17013     feat->SetQual().front()->SetVal("similar to RNA sequence: BLAST:AY123456.1");
17014     expected_errors[0]->SetErrMsg("Inference qualifier problem - bad accession type (similar to RNA sequence: BLAST:AY123456.1)");
17015     eval = validator.Validate(seh, options);
17016     CheckErrors (*eval, expected_errors);
17017 
17018     feat->SetQual().front()->SetVal("similar to AA sequence:RefSeq:gi|21240850|ref|NP_640432.1|");
17019     eval = validator.Validate(seh, options);
17020     expected_errors[0]->SetErrMsg("Inference qualifier problem - the value in the accession field is not legal. The only allowed value is accession.version, eg AF123456.1. Problem = (similar to AA sequence:RefSeq:gi|21240850|ref|NP_640432.1|)");
17021     CheckErrors(*eval, expected_errors);
17022 
17023 
17024     CLEAR_ERRORS
17025 
17026     // SRA inferences are ok
17027     feat->SetQual().front()->SetVal("similar to RNA sequence:INSD:ERP003431");
17028     //AddChromosomeNoLocation(expected_errors, entry);
17029     eval = validator.Validate(seh, options);
17030     CheckErrors (*eval, expected_errors);
17031 
17032     // GeneDB is ok for similar to
17033     feat->SetQual().front()->SetVal("similar to RNA sequence:GeneDB:LmjF.01.0090");
17034     eval = validator.Validate(seh, options);
17035     CheckErrors (*eval, expected_errors);
17036 
17037     CLEAR_ERRORS
17038 }
17039 
17040 
// A protein whose id is the RefSeq accession XP_654321 but whose name is
// "hypothetical protein XP_123" should draw a HypotheticalProteinMismatch warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_HypotheticalProteinMismatch)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Swap the protein id for an "other" (RefSeq) accession.
    CRef<CSeq_id> refseq_prot_id(new CSeq_id());
    refseq_prot_id->SetOther().SetAccession("XP_654321");
    unit_test_util::ChangeProtId (entry, refseq_prot_id);

    // Replace the protein name with one that mentions a different accession.
    CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    auto& prot_ref = prot_feat->SetData().SetProt();
    prot_ref.ResetName();
    prot_ref.SetName().push_back("hypothetical protein XP_123");

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("ref|XP_654321|", eDiag_Warning, "HypotheticalProteinMismatch",
                               "Hypothetical protein reference does not match accession"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17061 
17062 
// A partial coding region whose product is the very Bioseq it is located on
// should be reported as self-referential (plus the accompanying partial and
// packaging complaints listed below).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SelfReferentialProduct)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Turn a misc feature into a CDS covering 0..59, partial at both ends.
    CRef<CSeq_feat> self_cds = unit_test_util::AddMiscFeature(entry);
    self_cds->SetData().SetCdregion();
    auto& cds_loc = self_cds->SetLocation();
    cds_loc.SetInt().SetTo(59);
    cds_loc.SetPartialStart(true, eExtreme_Biological);
    cds_loc.SetPartialStop(true, eExtreme_Biological);
    self_cds->SetPartial(true);

    // Point the product at the sequence's own first id.
    self_cds->SetProduct().SetWhole().Assign(*(entry->SetSeq().SetId().front()));

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Error, "SelfReferentialProduct",
                               "Self-referential feature product"));
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "PartialsInconsistent",
                               "Inconsistent: Product= complete, Location= partial, Feature.partial= TRUE"));
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Error, "CDSproductPackagingProblem",
                               "Protein product not packaged in nuc-prot set with nucleotide"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17088 
17089 
// An ITS feature that starts two positions after the end of an rRNA feature
// should be reported with ITSdoesNotAbutRRNA. The check is repeated after
// reverse-complementing the entry and after renaming the features
// (18s/ITS1 -> 5.8S/ITS2).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ITSdoesNotAbutRRNA)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> rrna_feat = unit_test_util::AddMiscFeature (entry);
    rrna_feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
    rrna_feat->SetData().SetRna().SetExt().SetName("18s ribosomal subunit");

    CRef<CSeq_feat> its_feat = unit_test_util::AddMiscFeature (entry);
    its_feat->SetData().SetRna().SetType(CRNA_ref::eType_miscRNA);
    its_feat->SetData().SetRna().SetExt().SetName("internal transcribed spacer 1");

    // Place the ITS at [rRNA stop + 2, rRNA stop + 12].
    const auto rrna_stop = rrna_feat->GetLocation().GetInt().GetTo();
    auto& its_interval = its_feat->SetLocation().SetInt();
    its_interval.SetFrom(rrna_stop + 2);
    its_interval.SetTo(rrna_stop + 12);

    STANDARD_SETUP

    // Validate the current handle and compare with the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
    };

    // Take the entry out of the scope, reverse-complement it, and add it back.
    auto revcomp_in_scope = [&]() {
        scope.RemoveTopLevelSeqEntry(seh);
        unit_test_util::RevComp (entry);
        seh = scope.AddTopLevelSeqEntry(*entry);
    };

    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "ITSdoesNotAbutRRNA",
                               "ITS does not abut adjacent rRNA component"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    revcomp_in_scope();
    validate_and_check();

    rrna_feat->SetData().SetRna().SetExt().SetName("5.8S ribosomal subunit");
    its_feat->SetData().SetRna().SetExt().SetName("internal transcribed spacer 2");
    validate_and_check();

    revcomp_in_scope();
    validate_and_check();

    CLEAR_ERRORS
}
17130 
17131 
// A feature location whose local id is spelled "Good" while the error is
// reported against "lcl|good" should draw FeatureSeqIDCaseDifference.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureSeqIDCaseDifference)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> miscased_feat = unit_test_util::AddMiscFeature(entry);
    auto& loc_id = miscased_feat->SetLocation().SetInt().SetId();
    loc_id.SetLocal().SetStr("Good");

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Error, "FeatureSeqIDCaseDifference",
                               "Sequence identifier in feature location differs in capitalization with identifier on Bioseq"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17147 
17148 
// A sequence whose only id is gi 0, carrying a misc feature, should produce
// ZeroGiNumber, GiWithoutAccession, and FeatureLocationIsGi0.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureLocationIsGi0)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Set the id to gi|0 before the feature is added.
    entry->SetSeq().SetId().front()->SetGi(ZERO_GI);
    // The returned feature is not needed afterwards; only the addition matters.
    unit_test_util::AddMiscFeature(entry);

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("gi|0", eDiag_Critical, "ZeroGiNumber",
                               "Invalid GI number"));
    expected_errors.push_back (new CExpectedError("gi|0", eDiag_Error, "GiWithoutAccession",
                               "No accession on sequence with gi number"));
    expected_errors.push_back (new CExpectedError("gi|0", eDiag_Critical, "FeatureLocationIsGi0",
                               "Feature has 1 gi|0 location on Bioseq gi|0"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17169 
17170 
// Places a "gap" feature (estimated_length 11) at several intervals of a delta
// sequence and checks the GapFeatureProblem message reported for each.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GapFeatureProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    // Replace the last delta literal so that it contains an N.
    auto& delta_parts = entry->SetSeq().SetInst().SetExt().SetDelta().Set();
    delta_parts.back()->SetLiteral().SetSeq_data().SetIupacna().Set("CNCATGATGATG");

    CRef<CSeq_feat> gap_feat = unit_test_util::AddMiscFeature(entry);
    gap_feat->SetData().SetImp().SetKey("gap");
    gap_feat->AddQualifier("estimated_length", "11");

    STANDARD_SETUP

    // Move the gap feature to [from, to], update the expected message,
    // validate, and compare.
    auto check_gap_at = [&](TSeqPos from, TSeqPos to, const string& message) {
        auto& gap_interval = gap_feat->SetLocation().SetInt();
        gap_interval.SetFrom(from);
        gap_interval.SetTo(to);
        expected_errors[0]->SetErrMsg(message);
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
    };

    // Feature at the location assigned by AddMiscFeature.
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Error, "GapFeatureProblem",
                               "Gap feature over 11 real bases"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    check_gap_at(10, 20, "Gap feature over 2 real bases");
    check_gap_at(20, 30, "Gap feature over 8 real bases and 1 Ns");
    check_gap_at(12, 21, "Gap feature estimated_length 11 does not match 10 feature length");

    CLEAR_ERRORS
}
17208 
17209 
BuildGenProdSetBigNucProtSet(CRef<CSeq_id> nuc_id,CRef<CSeq_id> prot_id)17210 static CRef<CSeq_entry> BuildGenProdSetBigNucProtSet (CRef<CSeq_id> nuc_id, CRef<CSeq_id> prot_id)
17211 {
17212     CRef<CSeq_entry> np = unit_test_util::BuildGoodNucProtSet();
17213     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(np);
17214     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATAA");
17215     nuc->SetSeq().SetInst().SetLength(366);
17216     nuc->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);
17217     unit_test_util::SetBiomol(nuc, CMolInfo::eBiomol_mRNA);
17218     CRef<CSeq_entry> prot = unit_test_util::GetProteinSequenceFromGoodNucProtSet(np);
17219     prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MFFFFFFFFFFPPPPPPPPPPGGGGGGGGGGKKKKKKKKKKFFFFFFFFFFPPPPPPPPPPGGGGGGGGGGKKKKKKKKKKFFFFFFFFFFPPPPPPPPPPGGGGGGGGGGKKKKKKKKKK");
17220     prot->SetSeq().SetInst().SetLength(121);
17221     unit_test_util::AdjustProtFeatForNucProtSet (np);
17222     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(np);
17223     cds->SetLocation().SetInt().SetFrom(0);
17224     cds->SetLocation().SetInt().SetTo(nuc->GetSeq().GetInst().GetLength()-1);
17225     if (nuc_id) {
17226         unit_test_util::ChangeNucProtSetNucId(np, nuc_id);
17227     }
17228     if (prot_id) {
17229         unit_test_util::ChangeNucProtSetProteinId(np, prot_id);
17230     }
17231     return np;
17232 }
17233 
17234 
BuildGenProdSetWithBigProduct()17235 static CRef<CSeq_entry> BuildGenProdSetWithBigProduct()
17236 {
17237     CRef<CSeq_entry> entry(new CSeq_entry());
17238     entry->SetSet().SetClass(CBioseq_set::eClass_gen_prod_set);
17239     CRef<CSeq_entry> contig = unit_test_util::BuildGoodSeq();
17240     contig->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATAAGGGCCCTTT");
17241     contig->SetSeq().SetInst().SetLength(375);
17242     entry->SetSet().SetSeq_set().push_back (contig);
17243     CRef<CSeq_id> nuc_id(new CSeq_id());
17244     nuc_id->SetLocal().SetStr("nuc");
17245     CRef<CSeq_id> prot_id(new CSeq_id());
17246     prot_id->SetLocal().SetStr("prot");
17247     CRef<CSeq_entry> np = BuildGenProdSetBigNucProtSet(nuc_id, prot_id);
17248     entry->SetSet().SetSeq_set().push_back (np);
17249 
17250     CRef<CSeq_feat> cds(new CSeq_feat());
17251     cds->Assign (*(unit_test_util::GetCDSFromGoodNucProtSet(np)));
17252     cds->SetLocation().SetInt().SetId().SetLocal().SetStr("good");
17253     unit_test_util::AddFeat (cds, contig);
17254     CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
17255     mrna->SetProduct().SetWhole().Assign(*nuc_id);
17256     unit_test_util::AddFeat (mrna, contig);
17257 
17258     return entry;
17259 }
17260 
17261 
// CDS and mRNA carry "unclassified ... discrepancy" exceptions while the
// genomic sequence is replaced with the literal below; the validator is
// expected to report ErroneousException for both features.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ErroneousException)
{
    CRef<CSeq_entry> entry = BuildGenProdSetWithBigProduct();

    // Flag a feature as an exception with the given explanatory text.
    auto flag_exception = [](CSeq_feat& feature, const string& text) {
        feature.SetExcept(true);
        feature.SetExcept_text(text);
    };

    CRef<CSeq_feat> genomic_cds = unit_test_util::GetCDSFromGenProdSet (entry);
    flag_exception(*genomic_cds, "unclassified translation discrepancy");
    CRef<CSeq_feat> genomic_mrna = unit_test_util::GetmRNAFromGenProdSet(entry);
    flag_exception(*genomic_mrna, "unclassified transcription discrepancy");

    // Replace the genomic sequence data.
    CRef<CSeq_entry> genomic_seq = unit_test_util::GetGenomicFromGenProdSet(entry);
    genomic_seq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGTTTCTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATAAGGGCCCTTT");

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "ErroneousException",
                               "CDS has unclassified exception but only difference is 1 mismatches out of 121 residues"));
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "ErroneousException",
                               "mRNA has unclassified exception but only difference is 1 mismatches out of 366 bases"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17286 
17287 
// A misc feature, a pseudo CDS, and a pseudo mRNA are each given a "whole"
// location; one WholeLocation warning is expected per feature.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_WholeLocation)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Replace a feature's location with a whole-sequence location on the
    // entry's first id.
    auto make_whole = [&](CSeq_feat& feature) {
        feature.SetLocation().SetWhole().Assign(*(entry->SetSeq().SetId().front()));
    };

    CRef<CSeq_feat> whole_misc = unit_test_util::AddMiscFeature(entry);
    make_whole(*whole_misc);

    CRef<CSeq_feat> whole_cds = unit_test_util::AddMiscFeature(entry);
    whole_cds->SetData().SetCdregion();
    make_whole(*whole_cds);
    whole_cds->SetPseudo(true);

    CRef<CSeq_feat> whole_mrna = unit_test_util::AddMiscFeature(entry);
    whole_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
    make_whole(*whole_mrna);
    whole_mrna->SetPseudo(true);

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "WholeLocation",
                               "Feature may not have whole location"));
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "WholeLocation",
                               "CDS may not have whole location"));
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "WholeLocation",
                               "mRNA may not have whole location"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17318 
17319 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_EcNumberProblem)17320 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_EcNumberProblem)
17321 {
17322     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
17323     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
17324     cds->SetComment("EC:1.1.1.10");
17325     CRef<CSeq_feat> prot = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
17326     prot->SetData().SetProt().SetName().front().append("; EC:1.1.1.10");
17327     prot->SetComment("EC:1.1.1.10");
17328     prot->SetData().SetProt().SetEc().push_back("");
17329 
17330     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
17331     CRef<CSeq_feat> exon = unit_test_util::AddMiscFeature(nuc);
17332     exon->SetData().SetImp().SetKey("exon");
17333     exon->AddQualifier("EC_number", "");
17334 
17335     STANDARD_SETUP
17336 
17337     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidPunctuation",
17338                                "Qualifier other than replace has just quotation marks"));
17339     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "EcNumberEmpty",
17340                                "EC number should not be empty"));
17341     expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinName",
17342                                "Apparent EC number in protein title"));
17343     expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinComment",
17344                                "Apparent EC number in protein comment"));
17345     expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberEmpty",
17346                                "EC number should not be empty"));
17347     //AddChromosomeNoLocation(expected_errors, entry);
17348     eval = validator.Validate(seh, options);
17349     CheckErrors (*eval, expected_errors);
17350 
17351     CLEAR_ERRORS
17352 
17353     prot->SetData().SetProt().ResetEc();
17354     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "InvalidPunctuation",
17355                                "Qualifier other than replace has just quotation marks"));
17356     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "EcNumberEmpty",
17357                                "EC number should not be empty"));
17358     expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinName",
17359                                "Apparent EC number in protein title"));
17360     expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "EcNumberInProteinComment",
17361                                "Apparent EC number in protein comment"));
17362     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Info, "EcNumberInCDSComment",
17363                                "Apparent EC number in CDS comment"));
17364     //AddChromosomeNoLocation(expected_errors, entry);
17365 
17366     eval = validator.Validate(seh, options);
17367     CheckErrors (*eval, expected_errors);
17368 
17369     CLEAR_ERRORS
17370 }
17371 
17372 
// A misc feature with standard_name "Vector Contamination" should draw the
// VectorContamination warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_VectorContamination)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> vector_feat = unit_test_util::AddMiscFeature(entry);
    vector_feat->AddQualifier("standard_name", "Vector Contamination");

    STANDARD_SETUP

    //AddChromosomeNoLocation(expected_errors, entry);
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "VectorContamination",
                               "Vector Contamination region should be trimmed from sequence"));

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17390 
17391 
// A minus-strand feature on a protein sequence should draw the
// MinusStrandProtein warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MinusStrandProtein)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodProtSeq();
    CRef<CSeq_feat> minus_feat = unit_test_util::AddMiscFeature(entry);
    auto& feat_interval = minus_feat->SetLocation().SetInt();
    feat_interval.SetStrand(eNa_strand_minus);
    feat_interval.SetTo(5);

    STANDARD_SETUP

    //AddChromosomeNoLocation(expected_errors, entry);
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "MinusStrandProtein",
                               "Feature on protein indicates negative strand"));

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17410 
17411 
// A protein that has an EC number and is named "hypothetical protein" or
// "unknown protein" (either capitalization) should draw BadProteinName.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadProteinName)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    auto& prot_ref = prot_feat->SetData().SetProt();
    prot_ref.ResetName();
    prot_ref.SetName().push_back("Hypothetical protein");
    prot_ref.SetEc().push_back("1.1.1.20");

    STANDARD_SETUP

    // Validate the current state and compare with the expected list.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors (*eval, expected_errors);
    };

    // Replace the protein name with a single new name, then validate.
    auto check_with_name = [&](const string& protein_name) {
        prot_ref.ResetName();
        prot_ref.SetName().push_back(protein_name);
        validate_and_check();
    };

    expected_errors.push_back (new CExpectedError("lcl|prot", eDiag_Warning, "BadProteinName",
                               "Unknown or hypothetical protein should not have EC number"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    // The same warning is expected for each of the remaining spellings.
    check_with_name("hypothetical protein");
    check_with_name("Unknown protein");
    check_with_name("unknown protein");

    CLEAR_ERRORS
}
17445 
17446 
// A feature carries a gene xref with only a locus_tag, while a gene feature
// with that locus_tag and a locus is present; GeneXrefWithoutLocus is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefWithoutLocus)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> target_feat = unit_test_util::AddMiscFeature(entry);

    // First gene: exactly as produced by MakeGeneForFeature.
    CRef<CSeq_feat> plain_gene = unit_test_util::MakeGeneForFeature (target_feat);
    unit_test_util::AddFeat(plain_gene, entry);

    // Second gene: locus_tag plus locus, ending 5 positions past the feature.
    CRef<CSeq_feat> tagged_gene = unit_test_util::MakeGeneForFeature (target_feat);
    auto& tagged_gene_ref = tagged_gene->SetData().SetGene();
    tagged_gene_ref.SetLocus_tag("locus_tag");
    tagged_gene_ref.SetLocus ("second locus");
    tagged_gene->SetLocation().SetInt().SetTo(target_feat->GetLocation().GetInt().GetTo() + 5);
    unit_test_util::AddFeat(tagged_gene, entry);

    // Xref on the feature that names only the locus_tag.
    CRef<CSeqFeatXref> gene_xref(new CSeqFeatXref());
    gene_xref->SetData().SetGene().SetLocus_tag("locus_tag");
    target_feat->SetXref().push_back(gene_xref);

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "GeneXrefWithoutLocus",
                               "Feature has Gene Xref with locus_tag but no locus, gene with locus_tag and locus exists"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17472 
17473 
// On a 36-base mRNA, a 3'UTR that starts right after the CDS but stops at
// position 30 should draw UTRdoesNotExtendToEnd.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UTRdoesNotExtendToEnd)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Replace the nucleotide with a short mRNA.
    CRef<CSeq_entry> mrna_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CSeq_inst& mrna_inst = mrna_seq->SetSeq().SetInst();
    mrna_inst.SetSeq_data().SetIupacna().Set("ATGCCCAGAAAAACAGAGATAAACTAAAAAGGGAAA");
    mrna_inst.SetLength(36);
    mrna_inst.SetMol(CSeq_inst::eMol_rna);
    unit_test_util::SetBiomol(mrna_seq, CMolInfo::eBiomol_mRNA);

    // 3'UTR from the base after the CDS stop up to position 30.
    CRef<CSeq_feat> coding = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    CRef<CSeq_feat> three_prime_utr = unit_test_util::AddMiscFeature(mrna_seq);
    three_prime_utr->SetData().SetImp().SetKey("3'UTR");
    auto& utr_interval = three_prime_utr->SetLocation().SetInt();
    utr_interval.SetFrom(coding->GetLocation().GetInt().GetTo() + 1);
    utr_interval.SetTo(30);

    STANDARD_SETUP

    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "UTRdoesNotExtendToEnd",
                               "3'UTR does not extend to end of mRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17498 
17499 
// With a run of Ns at the start of the nucleotide and a protein made mostly of
// X residues, the validator is expected to report FeatureIsMostlyNs,
// CDShasTooManyXs, and HighNpercent5Prime.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDShasTooManyXs)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    CRef<CSeq_entry> nuc_seq = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    auto& nuc_data = nuc_seq->SetSeq().SetInst().SetSeq_data();
    nuc_data.SetIupacna().Set("ATGNNNNNNNNNNNNNNNATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");

    CRef<CSeq_entry> prot_seq = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);
    auto& prot_data = prot_seq->SetSeq().SetInst().SetSeq_data();
    prot_data.SetIupacaa().Set("MXXXXXIN");

    STANDARD_SETUP

    //AddChromosomeNoLocation(expected_errors, entry);
    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Info, "FeatureIsMostlyNs",
                               "Feature contains more than 50% Ns"));
    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Info, "CDShasTooManyXs",
                               "CDS translation consists of more than 50% X residues"));
    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Info, "HighNpercent5Prime",
                               "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17523 
// Checks the two SuspiciousFrame messages:
//   1. frame two on a CDS that is not 5' partial (reported as an error), and
//   2. frame three on a 5' partial CDS starting at position 1 (reported as a
//      warning, together with PartialProblemNotSpliceConsensus5Prime).
// In both parts the protein sequence is regenerated from the edited CDS via
// CSeqTranslator::Translate before validating.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SuspiciousFrame)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    // Part 1: frame two, CDS shortened to end at position 21.
    cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_two);
    cds->SetLocation().SetInt().SetTo(21);

    STANDARD_SETUP
    // Translate while the entry is still in the scope, then take the entry out
    // of the scope before editing the protein and re-add it afterwards.
    string tmp;
    CSeqTranslator::Translate(*cds, scope, tmp, false, false);
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_entry> prot = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);
    // Store the translation as the protein sequence and update its length.
    prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set(tmp);
    prot->SetSeq().SetInst().SetLength(tmp.length());
    unit_test_util::AdjustProtFeatForNucProtSet (entry);
    CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Error, "SuspiciousFrame",
                               "Suspicious CDS location - reading frame > 1 but not 5' partial"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Part 2: frame three, CDS at 1..26, marked 5' partial.
    cds->SetData().SetCdregion().SetFrame(CCdregion::eFrame_three);
    cds->SetLocation().SetInt().SetFrom(1);
    cds->SetLocation().SetInt().SetTo(26);
    cds->SetLocation().SetPartialStart(true, eExtreme_Biological);
    cds->SetPartial(true);
    // Re-translate the edited CDS and rebuild the protein as in part 1.
    tmp.clear();
    CSeqTranslator::Translate(*cds, scope, tmp, false, false);
    scope.RemoveTopLevelSeqEntry(seh);
    prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set(tmp);
    prot->SetSeq().SetInst().SetLength(tmp.length());
    unit_test_util::AdjustProtFeatForNucProtSet (entry);
    // Mark the protein and its feature as partial at the start as well.
    unit_test_util::SetCompleteness (prot, CMolInfo::eCompleteness_no_left);
    prot_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
    prot_feat->SetPartial(true);
    seh = scope.AddTopLevelSeqEntry(*entry);
    // Drop the part-1 expectation before listing the part-2 expectations.
    CLEAR_ERRORS

    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus5Prime",
                               "5' partial is not at beginning of sequence, gap, or consensus splice site"));
    expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "SuspiciousFrame",
                               "Suspicious CDS location - reading frame > 1 and not at consensus splice site"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
17575 
17576 
// Expects TransLen and TerminalXDiscrepancy for a 3'-partial CDS whose
// protein product has been given a sequence ending in "XX".
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TerminalXDiscrepancy)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Nucleotide: replace the residues with a sequence that has an N near the end.
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set(
        "ATGCCCAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACNAAGGG");

    // CDS: 3' partial, from position 30 through (instance length - 1).
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetPartial(true);
    cds->SetLocation().SetPartialStop(true, eExtreme_Biological);
    auto& cds_interval = cds->SetLocation().SetInt();
    cds_interval.SetFrom(30);
    cds_interval.SetTo(nuc->GetSeq().GetInst().GetLength() - 1);

    // Protein: ten residues with two trailing X, marked as missing its right end.
    CRef<CSeq_entry> prot = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);
    auto& prot_inst = prot->SetSeq().SetInst();
    prot_inst.SetSeq_data().SetIupacaa().Set("MPRKTEINXX");
    prot_inst.SetLength(10);
    unit_test_util::SetCompleteness(prot, CMolInfo::eCompleteness_no_right);

    // Protein feature: resized by the helper, then flagged 3' partial.
    CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    unit_test_util::AdjustProtFeatForNucProtSet(entry);
    prot_feat->SetPartial(true);
    prot_feat->SetLocation().SetPartialStop(true, eExtreme_Biological);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Error, "TransLen",
        "Given protein length [8] does not match translation length [10]"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "TerminalXDiscrepancy",
        "Terminal X count for CDS translation (0) and protein product sequence (2) are not equal"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
17607 
17608 
// Adds a transl_except (code break) for 'P' to the CDS and checks two cases:
// positions 3..5 (expects UnnecessaryTranslExcept) and positions 0..2
// (expects TranslExcept plus MisMatchAA).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnnecessaryTranslExcept)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Code break on lcl|nuc, positions 3..5, amino acid 'P'.
    CRef<CCode_break> codebreak(new CCode_break());
    {
        auto& span = codebreak->SetLoc().SetInt();
        span.SetId().SetLocal().SetStr("nuc");
        span.SetFrom(3);
        span.SetTo(5);
    }
    codebreak->SetAa().SetNcbieaa('P');
    cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);

    STANDARD_SETUP

    // Runs the validator on the current handle and compares with expectations.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "UnnecessaryTranslExcept",
        "Unnecessary transl_except P at position 2"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS

    // Move the code break onto positions 0..2.
    {
        auto& span = codebreak->SetLoc().SetInt();
        span.SetFrom(0);
        span.SetTo(2);
    }
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "TranslExcept",
        "Suspicious transl_except P at first codon of complete CDS"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Error, "MisMatchAA",
        "Residue 1 in protein [M] != translation [P] at lcl|nuc:1-3"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS
}
17641 
17642 
// A misc_difference feature carrying a /replace qualifier; the test expects
// an Info-level InvalidMatchingReplace message quoting the qualifier value.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidMatchingReplace)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> misc_diff = unit_test_util::AddMiscFeature(entry);
    misc_diff->SetData().SetImp().SetKey("misc_difference");
    misc_diff->AddQualifier("replace", "aattggccaaa");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "InvalidMatchingReplace",
        "/replace already matches underlying sequence (aattggccaaa)"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
17659 
17660 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NotSpliceConsensusDonor)17661 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NotSpliceConsensusDonor)
17662 {
17663     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
17664     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
17665     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
17666     cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
17667     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'A';
17668     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'G';
17669     CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
17670     unit_test_util::AddFeat(intron, nuc);
17671 
17672     STANDARD_SETUP
17673 
17674     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor",
17675                                "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|nuc"));
17676     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor",
17677                                "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|nuc"));
17678     //AddChromosomeNoLocation(expected_errors, entry);
17679     eval = validator.Validate(seh, options);
17680     CheckErrors (*eval, expected_errors);
17681 
17682     scope.RemoveTopLevelSeqEntry(seh);
17683     unit_test_util::RevComp(entry);
17684     seh = scope.AddTopLevelSeqEntry(*entry);
17685     expected_errors[0]->SetErrMsg("Splice donor consensus (GT) not found at start of intron, position 44 of lcl|nuc");
17686     expected_errors[1]->SetErrMsg("Splice donor consensus (GT) not found after exon ending at position 45 of lcl|nuc");
17687     eval = validator.Validate(seh, options);
17688     CheckErrors (*eval, expected_errors);
17689 
17690     scope.RemoveTopLevelSeqEntry(seh);
17691     unit_test_util::RevComp(entry);
17692     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = '\xFB';
17693     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[17] = '\xFB';
17694     seh = scope.AddTopLevelSeqEntry(*entry);
17695     CLEAR_ERRORS
17696     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [17]"));
17697     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [18]"));
17698     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|nuc"));
17699     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Bad sequence at splice donor after exon ending at position 16 of lcl|nuc"));
17700     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17701     //AddChromosomeNoLocation(expected_errors, entry);
17702     eval = validator.Validate(seh, options);
17703     CheckErrors (*eval, expected_errors);
17704 
17705     scope.RemoveTopLevelSeqEntry(seh);
17706     unit_test_util::RevComp(entry);
17707     seh = scope.AddTopLevelSeqEntry(*entry);
17708     CLEAR_ERRORS
17709     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [43]"));
17710     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue", "Invalid residue [251] at position [44]"));
17711     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Splice donor consensus (GT) not found at start of intron, position 44 of lcl|nuc"));
17712     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusDonor", "Bad sequence at splice donor after exon ending at position 45 of lcl|nuc"));
17713     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17714     //AddChromosomeNoLocation(expected_errors, entry);
17715 
17716     eval = validator.Validate(seh, options);
17717     CheckErrors (*eval, expected_errors);
17718 
17719     CLEAR_ERRORS
17720 
17721     scope.RemoveTopLevelSeqEntry(seh);
17722     entry = unit_test_util::BuildGoodSeq();
17723     intron = unit_test_util::AddMiscFeature(entry);
17724     intron->SetData().SetImp().SetKey("intron");
17725     seh = scope.AddTopLevelSeqEntry(*entry);
17726     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17727                                "Splice donor consensus (GT) not found at start of terminal intron, position 1 of lcl|good"));
17728     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17729                                "Splice acceptor consensus (AG) not found at end of intron, position 11 of lcl|good"));
17730     //AddChromosomeNoLocation(expected_errors, entry);
17731     eval = validator.Validate(seh, options);
17732     CheckErrors (*eval, expected_errors);
17733 
17734     scope.RemoveTopLevelSeqEntry(seh);
17735     unit_test_util::RevComp(entry);
17736     seh = scope.AddTopLevelSeqEntry(*entry);
17737     CLEAR_ERRORS
17738     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17739                                "Splice donor consensus (GT) not found at start of terminal intron, position 60 of lcl|good"));
17740     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17741                                "Splice acceptor consensus (AG) not found at end of intron, position 50 of lcl|good"));
17742     //AddChromosomeNoLocation(expected_errors, entry);
17743     eval = validator.Validate(seh, options);
17744     CheckErrors (*eval, expected_errors);
17745 
17746     CLEAR_ERRORS
17747 }
17748 
17749 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NotSpliceConsensusAcceptor)17750 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NotSpliceConsensusAcceptor)
17751 {
17752     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
17753     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
17754     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
17755     cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc->SetSeq().SetId().front()));
17756     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[16] = 'G';
17757     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[17] = 'T';
17758     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = 'T';
17759     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = 'C';
17760     CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc->SetSeq().SetId().front());
17761     unit_test_util::AddFeat(intron, nuc);
17762 
17763     STANDARD_SETUP
17764 
17765     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17766                                "Splice acceptor consensus (AG) not found at end of intron, position 46 of lcl|nuc"));
17767     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17768                                "Splice acceptor consensus (AG) not found before exon starting at position 47 of lcl|nuc"));
17769     //AddChromosomeNoLocation(expected_errors, entry);
17770     eval = validator.Validate(seh, options);
17771     CheckErrors (*eval, expected_errors);
17772 
17773     scope.RemoveTopLevelSeqEntry(seh);
17774     unit_test_util::RevComp(entry);
17775     seh = scope.AddTopLevelSeqEntry(*entry);
17776     expected_errors[0]->SetErrMsg("Splice acceptor consensus (AG) not found at end of intron, position 15 of lcl|nuc");
17777     expected_errors[1]->SetErrMsg("Splice acceptor consensus (AG) not found before exon starting at position 14 of lcl|nuc");
17778     eval = validator.Validate(seh, options);
17779     CheckErrors (*eval, expected_errors);
17780 
17781     scope.RemoveTopLevelSeqEntry(seh);
17782     unit_test_util::RevComp(entry);
17783     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[44] = '\xFB';
17784     nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[45] = '\xFB';
17785     seh = scope.AddTopLevelSeqEntry(*entry);
17786     CLEAR_ERRORS
17787     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue",
17788                                                   "Invalid residue [251] at position [45]"));
17789     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue",
17790                                                   "Invalid residue [251] at position [46]"));
17791     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17792                                                   "Splice acceptor consensus (AG) not found at end of intron, position 46 of lcl|nuc"));
17793     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17794                                                   "Bad sequence at splice acceptor before exon starting at position 47 of lcl|nuc"));
17795     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17796     //AddChromosomeNoLocation(expected_errors, entry);
17797     eval = validator.Validate(seh, options);
17798     CheckErrors (*eval, expected_errors);
17799 
17800     scope.RemoveTopLevelSeqEntry(seh);
17801     unit_test_util::RevComp(entry);
17802     seh = scope.AddTopLevelSeqEntry(*entry);
17803     CLEAR_ERRORS
17804     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical,
17805                                "InvalidResidue", "Invalid residue [251] at position [15]"));
17806     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Critical, "InvalidResidue",
17807                                "Invalid residue [251] at position [16]"));
17808     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17809                                "Splice acceptor consensus (AG) not found at end of intron, position 15 of lcl|nuc"));
17810     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Warning, "NotSpliceConsensusAcceptor",
17811                                "Bad sequence at splice acceptor before exon starting at position 14 of lcl|nuc"));
17812     expected_errors.push_back (new CExpectedError("lcl|nuc", eDiag_Fatal, "NonAsciiAsn", "Non-ASCII character '251' found in item"));
17813     //AddChromosomeNoLocation(expected_errors, entry);
17814 
17815     eval = validator.Validate(seh, options);
17816     CheckErrors (*eval, expected_errors);
17817 
17818     CLEAR_ERRORS
17819 
17820     scope.RemoveTopLevelSeqEntry(seh);
17821     entry = unit_test_util::BuildGoodSeq();
17822     intron = unit_test_util::AddMiscFeature(entry);
17823     intron->SetData().SetImp().SetKey("intron");
17824     seh = scope.AddTopLevelSeqEntry(*entry);
17825     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17826                                "Splice donor consensus (GT) not found at start of terminal intron, position 1 of lcl|good"));
17827     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17828                                "Splice acceptor consensus (AG) not found at end of intron, position 11 of lcl|good"));
17829     //AddChromosomeNoLocation(expected_errors, entry);
17830     eval = validator.Validate(seh, options);
17831     CheckErrors (*eval, expected_errors);
17832 
17833     scope.RemoveTopLevelSeqEntry(seh);
17834     unit_test_util::RevComp(entry);
17835     seh = scope.AddTopLevelSeqEntry(*entry);
17836     CLEAR_ERRORS
17837     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Info, "NotSpliceConsensusDonorTerminalIntron",
17838                                "Splice donor consensus (GT) not found at start of terminal intron, position 60 of lcl|good"));
17839     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
17840                                "Splice acceptor consensus (AG) not found at end of intron, position 50 of lcl|good"));
17841     //AddChromosomeNoLocation(expected_errors, entry);
17842     eval = validator.Validate(seh, options);
17843     CheckErrors (*eval, expected_errors);
17844 
17845     CLEAR_ERRORS
17846 }
17847 
17848 
// Intron bounded by "GC" at positions 16/17 and "AG" at positions 44/45.
// The expected-error list is left empty in both orientations.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RareSpliceConsensusDonor)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // Overwrites a single residue in the nucleotide's IUPACna string.
    auto set_base = [&nuc](size_t pos, char residue) {
        nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[pos] = residue;
    };

    auto nuc_id = nuc->SetSeq().SetId().front();
    cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_id));
    set_base(16, 'G');
    set_base(17, 'C');
    set_base(44, 'A');
    set_base(45, 'G');
    CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc_id);
    unit_test_util::AddFeat(intron, nuc);

    STANDARD_SETUP

    // Forward orientation: no longer reported, so nothing is expected.
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Reverse complement: still nothing expected.
    scope.RemoveTopLevelSeqEntry(seh);
    unit_test_util::RevComp(entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    CLEAR_ERRORS

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
17879 
// Intron bounded by "AT" at positions 16/17 and "AC" at positions 44/45.
// The expected-error list is left empty in both orientations.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RareSpliceConsensusDonor_VR_65)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // Overwrites a single residue in the nucleotide's IUPACna string.
    auto set_base = [&nuc](size_t pos, char residue) {
        nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set()[pos] = residue;
    };

    auto nuc_id = nuc->SetSeq().SetId().front();
    cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(nuc_id));
    set_base(16, 'A');
    set_base(17, 'T');
    set_base(44, 'A');
    set_base(45, 'C');
    CRef<CSeq_feat> intron = unit_test_util::MakeIntronForMixLoc(nuc_id);
    unit_test_util::AddFeat(intron, nuc);

    STANDARD_SETUP

    // Forward orientation: no longer reported, so nothing is expected.
    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    // Reverse complement: no longer reported either.
    scope.RemoveTopLevelSeqEntry(seh);
    unit_test_util::RevComp(entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    CLEAR_ERRORS

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
17911 
// Builds CDS (id 1), mRNA (id 2) and gene (id 3), then adds one-way xrefs
// CDS -> mRNA and mRNA -> gene. Expects a single SeqFeatXrefNotReciprocal.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqFeatXrefNotReciprocal)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // Coding region, local feature id 1.
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetId().SetLocal().SetId(1);

    // mRNA derived from the CDS, local feature id 2.
    CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
    mrna->SetId().SetLocal().SetId(2);
    unit_test_util::AddFeat(mrna, nuc);

    // Gene derived from the mRNA, local feature id 3.
    CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature(mrna);
    unit_test_util::AddFeat(gene, nuc);
    gene->SetId().SetLocal().SetId(3);

    // Cross-references are added in one direction only.
    cds->AddSeqFeatXref(mrna->GetId());
    mrna->AddSeqFeatXref(gene->GetId());

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "SeqFeatXrefNotReciprocal",
        "Cross-referenced feature does not link reciprocally"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
17939 
17940 
// Gives the CDS (local id 1) an xref to local id 2, which this test never
// assigns to any feature. Expects SeqFeatXrefFeatureMissing.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_SeqFeatXrefFeatureMissing)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetId().SetLocal().SetId(1);

    CRef<CSeqFeatXref> unresolved_xref(new CSeqFeatXref());
    unresolved_xref->SetId().SetLocal().SetId(2);
    cds->SetXref().push_back(unresolved_xref);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "SeqFeatXrefFeatureMissing",
        "Cross-referenced feature cannot be found"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
17959 
17960 
// Part 1: a misc feature at 12..20 on the good delta sequence; expects
// FeatureInsideGap. Part 2: the delta is extended with another gap and an
// N-rich literal and the feature is moved to 48..98; expects
// HighNContentPercent and FeatureIsMostlyNs.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureInsideGap)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(entry);
    {
        auto& span = misc->SetLocation().SetInt();
        span.SetFrom(12);
        span.SetTo(20);
    }

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "FeatureInsideGap",
        "Feature inside sequence gap"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
    scope.RemoveTopLevelSeqEntry(seh);

    // Append a 10-base gap followed by a literal that is mostly Ns.
    CRef<CDelta_seq> gap_seg(new CDelta_seq());
    {
        auto& gap_literal = gap_seg->SetLiteral();
        gap_literal.SetSeq_data().SetGap();
        gap_literal.SetLength(10);
    }
    {
        auto& inst = entry->SetSeq().SetInst();
        auto& delta = inst.SetExt().SetDelta();
        delta.Set().push_back(gap_seg);
        delta.AddLiteral(
            "CCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGATGATG",
            CSeq_inst::eMol_dna);
        inst.SetLength(116);
    }
    {
        auto& span = misc->SetLocation().SetInt();
        span.SetFrom(48);
        span.SetTo(98);
    }
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "HighNContentPercent",
        "Sequence contains 51 percent Ns"));
    /*
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
        "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
        "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
    */
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "FeatureIsMostlyNs",
        "Feature contains more than 50% Ns"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18004 
18005 
// Part 1: every gap literal gets an "unknown" length fuzz and an mRNA at
// 5..30 is added; expects CDSmRNAMismatchLocation and FeatureCrossesGap.
// Part 2: the mRNA location becomes a mix of 3..15 and 22..30; expects
// CDSmRNAMismatchLocation and IntervalBeginsOrEndsInGap.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureCrossesGap)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Flag each gap segment of the delta sequence with eLim_unk.
    for (auto& segment : entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
        if (!segment->IsLiteral()) {
            continue;
        }
        if (!segment->GetLiteral().GetSeq_data().IsGap()) {
            continue;
        }
        segment->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
    }

    CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(entry);
    misc->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
    {
        auto& span = misc->SetLocation().SetInt();
        span.SetFrom(5);
        span.SetTo(30);
    }

    STANDARD_SETUP

    // Validates the current handle and compares against expected_errors.
    auto validate_and_check = [&]() {
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    };

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
        "No CDS location match for 1 mRNA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "FeatureCrossesGap",
        "Feature crosses gap of unknown length"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS

    scope.RemoveTopLevelSeqEntry(seh);

    // Two intervals on lcl|good that replace the single-interval location.
    CRef<CSeq_loc> int1(new CSeq_loc());
    {
        auto& first = int1->SetInt();
        first.SetFrom(3);
        first.SetTo(15);
        first.SetId().SetLocal().SetStr("good");
    }
    CRef<CSeq_loc> int2(new CSeq_loc());
    {
        auto& second = int2->SetInt();
        second.SetFrom(22);
        second.SetTo(30);
        second.SetId().SetLocal().SetStr("good");
    }
    {
        auto& parts = misc->SetLocation().SetMix().Set();
        parts.push_back(int1);
        parts.push_back(int2);
    }
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
        "No CDS location match for 1 mRNA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "IntervalBeginsOrEndsInGap",
        "Internal interval begins or ends in gap"));
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_and_check();

    CLEAR_ERRORS

}
18056 
18057 
// Sets the first author's suffix on an article pub descriptor: "foo" is
// expected to trigger BadAuthorSuffix, "3rd" is expected to produce nothing.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAuthorSuffix)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Attach an article publication to the entry as a descriptor.
    CRef<CPub> pub = unit_test_util::BuildGoodArticlePub();
    CRef<CSeqdesc> desc(new CSeqdesc());
    desc->SetPub().SetPub().Set().push_back(pub);
    entry->SetDescr().Set().push_back(desc);

    // Writes the name suffix of the first standard author on the article.
    auto set_first_author_suffix = [&pub](const char* suffix) {
        pub->SetArticle().SetAuthors().SetNames().SetStd().front()
            ->SetName().SetName().SetSuffix(suffix);
    };
    set_first_author_suffix("foo");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadAuthorSuffix",
        "Bad author suffix foo"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // don't report good suffixes
    set_first_author_suffix("3rd");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18085 
18086 
// tRNA with anticodon at 8..10 and amino acid 'S'; expects BadAnticodonAA.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAnticodonAA)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());

    auto& trna_ext = trna->SetData().SetRna().SetExt().SetTRNA();
    auto& anticodon = trna_ext.SetAnticodon().SetInt();
    anticodon.SetFrom(8);
    anticodon.SetTo(10);
    trna_ext.SetAa().SetIupacaa('S');
    unit_test_util::AddFeat(trna, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadAnticodonAA",
        "Codons predicted from anticodon (AAA) cannot produce amino acid (S/Ser)"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18106 
18107 
// tRNA with anticodon at 8..10, amino acid 'K' and recognized codon value 42;
// expects BadAnticodonAA and BadAnticodonCodon.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAnticodonCodon)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());

    auto& trna_ext = trna->SetData().SetRna().SetExt().SetTRNA();
    auto& anticodon = trna_ext.SetAnticodon().SetInt();
    anticodon.SetFrom(8);
    anticodon.SetTo(10);
    trna_ext.SetAa().SetIupacaa('K');
    trna_ext.SetCodon().push_back(42);
    unit_test_util::AddFeat(trna, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadAnticodonAA",
        "Codons predicted from anticodon (AAA) cannot produce amino acid (K/Lys)"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadAnticodonCodon",
        "Codon recognized cannot be produced from anticodon (AAA)"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18130 
18131 
// Part 1: the anticodon is put on the minus strand while the tRNA location is
// left as built. Part 2: the anticodon strand is reset and the tRNA location
// is put on the minus strand. The same AnticodonStrandConflict error is
// expected both times (the expected list is not cleared in between).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadAnticodonStrand)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> trna = unit_test_util::BuildtRNA(entry->SetSeq().SetId().front());
    {
        auto& trna_ext = trna->SetData().SetRna().SetExt().SetTRNA();
        auto& anticodon = trna_ext.SetAnticodon().SetInt();
        anticodon.SetFrom(8);
        anticodon.SetTo(10);
        anticodon.SetStrand(eNa_strand_minus);
        trna_ext.SetAa().SetIupacaa('K');
    }
    unit_test_util::AddFeat(trna, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "AnticodonStrandConflict",
        "Anticodon strand and tRNA strand do not match."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Swap which of the two locations carries the minus strand.
    scope.RemoveTopLevelSeqEntry(seh);
    {
        auto& trna_ext = trna->SetData().SetRna().SetExt().SetTRNA();
        trna_ext.SetAnticodon().SetInt().ResetStrand();
        trna->SetLocation().SetInt().SetStrand(eNa_strand_minus);
        trna_ext.SetAa().SetIupacaa('F');
    }
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18160 
18161 
// Replaces the synonym list of `gene` with the single entry `name`, sets the
// message of expected_errors[0] to "Uninformative gene synonym '<name>'",
// validates and compares against expected_errors.
// The expansion uses gene, msg, expected_errors, eval, validator, seh and
// options from the surrounding scope.  It expands to several statements and
// already ends in ';', so call sites omit the trailing semicolon.
#define test_gene_syn(name) \
    gene->SetData().SetGene().ResetSyn(); \
    gene->SetData().SetGene().SetSyn().push_back(name); \
    msg.assign("Uninformative gene synonym '"); \
    msg += name; \
    msg += "'"; \
    expected_errors[0]->SetErrMsg(msg); \
    eval = validator.Validate(seh, options); \
    CheckErrors (*eval, expected_errors);
18171 
18172 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UndesiredGeneSynonym)18173 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UndesiredGeneSynonym)
18174 {
18175     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
18176     entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");
18177     CRef<CSeq_feat> gene = unit_test_util::AddMiscFeature(entry);
18178     gene->SetData().SetGene().SetLocus("something");
18179     string msg = "";
18180 
18181     STANDARD_SETUP
18182 
18183     expected_errors.push_back (new CExpectedError("ref|NC_123456|", eDiag_Warning, "UndesiredGeneSynonym", ""));
18184     //AddChromosomeNoLocation(expected_errors, entry);
18185 
18186     test_gene_syn("alpha")
18187     test_gene_syn("alternative")
18188     test_gene_syn("beta")
18189     test_gene_syn("cellular")
18190     test_gene_syn("cytokine")
18191     test_gene_syn("delta")
18192     test_gene_syn("drosophila")
18193     test_gene_syn("epsilon")
18194     test_gene_syn("gamma")
18195     test_gene_syn("HLA")
18196     test_gene_syn("homolog")
18197     test_gene_syn("mouse")
18198     test_gene_syn("orf")
18199     test_gene_syn("partial")
18200     test_gene_syn("plasma")
18201     test_gene_syn("precursor")
18202     test_gene_syn("pseudogene")
18203     test_gene_syn("putative")
18204     test_gene_syn("rearranged")
18205     test_gene_syn("small")
18206     test_gene_syn("trna")
18207     test_gene_syn("unknown")
18208     test_gene_syn("unknown function")
18209     test_gene_syn("unknown protein")
18210     test_gene_syn("unnamed")
18211 
18212 
18213     gene->SetData().SetGene().ResetSyn();
18214     gene->SetData().SetGene().SetSyn().push_back("same_as");
18215     gene->SetData().SetGene().SetLocus("same_as");
18216     expected_errors[0]->SetErrMsg("gene synonym has same value as gene locus");
18217     eval = validator.Validate(seh, options);
18218     CheckErrors (*eval, expected_errors);
18219 
18220     gene->SetData().SetGene().ResetSyn();
18221     gene->SetData().SetGene().SetDesc("same_as");
18222     expected_errors[0]->SetErrMsg("gene description has same value as gene locus");
18223     eval = validator.Validate(seh, options);
18224     CheckErrors (*eval, expected_errors);
18225 
18226     gene->SetData().SetGene().ResetDesc();
18227     gene->SetData().SetGene().ResetLocus();
18228     gene->SetData().SetGene().SetSyn().push_back("only_syn");
18229     expected_errors[0]->SetErrMsg("gene synonym without gene locus or description");
18230     eval = validator.Validate(seh, options);
18231     CheckErrors (*eval, expected_errors);
18232 
18233 
18234     CLEAR_ERRORS
18235 }
18236 
18237 
// Replaces the name list of `prot` with the single entry `name`, sets the
// message of expected_errors[0] to "Uninformative protein name '<name>'",
// validates and compares against expected_errors.
// The expansion uses prot, msg, expected_errors, eval, validator, seh and
// options from the surrounding scope.  It expands to several statements and
// already ends in ';', so call sites omit the trailing semicolon.
#define test_undesired_protein_name(name) \
    prot->SetData().SetProt().ResetName(); \
    prot->SetData().SetProt().SetName().push_back(name); \
    msg.assign("Uninformative protein name '"); \
    msg += name; \
    msg += "'"; \
    expected_errors[0]->SetErrMsg(msg); \
    eval = validator.Validate(seh, options); \
    CheckErrors (*eval, expected_errors);
18247 
// Exercises UndesiredProteinName on a nuc-prot set whose nucleotide id is
// changed to accession NC_123456.  One expected warning is reused and its
// message rewritten by test_undesired_protein_name for each name; the name
// "a~b" additionally expects a BadInternalCharacter warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UndesiredProteinName)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_id> refseq_id(new CSeq_id());
    refseq_id->SetOther().SetAccession("NC_123456");
    unit_test_util::ChangeNucProtSetNucId(entry, refseq_id);

    // `prot` and `msg` are referenced by name inside the helper macro.
    CRef<CSeq_feat> prot = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    string msg;

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "UndesiredProteinName", ""));
    //AddChromosomeNoLocation(expected_errors, entry);

    test_undesired_protein_name("a=b")

    // "a~b" expects a second, temporary error entry.
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "BadInternalCharacter",
        "Protein name contains undesired character"));
    test_undesired_protein_name("a~b")
    // Drop the temporary entry again (it is the last of the two elements).
    delete expected_errors.back();
    expected_errors.pop_back();

    test_undesired_protein_name("uniprot protein")
    test_undesired_protein_name("uniprotkb protein")
    test_undesired_protein_name("refers to pmid 23")
    test_undesired_protein_name("refers to dbxref")
    // test_undesired_protein_name("hypothetical protein")
    test_undesired_protein_name("uncharacterized conserved membrane protein")
    test_undesired_protein_name("unknown; predicted coding region")
    test_undesired_protein_name("unnamed")

    CLEAR_ERRORS
}
18280 
18281 
// Checks FeatureBeginsOrEndsInGap on a delta sequence whose gap literals are
// marked with unknown-length fuzz.  The same warning ("gap starting at 13")
// is expected for four placements of a misc feature:
//   5-20 (no strand), 5-20 minus, 14-30 minus, 14-30 (strand reset).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_FeatureBeginsOrEndsInGap)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    NON_CONST_ITERATE (CDelta_ext::Tdata, seg_it, entry->SetSeq().SetInst().SetExt().SetDelta().Set()) {
        CDelta_seq& segment = **seg_it;
        // Only gap literals receive the fuzz.
        if (!segment.IsLiteral() || !segment.GetLiteral().GetSeq_data().IsGap()) {
            continue;
        }
        segment.SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
    }

    CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(entry);
    CSeq_interval& misc_int = misc->SetLocation().SetInt();
    misc_int.SetFrom(5);
    misc_int.SetTo(20);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "FeatureBeginsOrEndsInGap",
        "Feature begins or ends in gap starting at 13"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Same interval on the minus strand.
    scope.RemoveTopLevelSeqEntry(seh);
    misc_int.SetStrand(eNa_strand_minus);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Move the interval to 14-30, still on the minus strand.
    scope.RemoveTopLevelSeqEntry(seh);
    misc_int.SetFrom(14);
    misc_int.SetTo(30);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // 14-30 with the strand cleared again.
    scope.RemoveTopLevelSeqEntry(seh);
    misc_int.ResetStrand();
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18325 
18326 
// Checks GeneOntologyTermMissingGOID: a "GeneOntology" user object is attached
// to a misc feature with one "Process" term carrying a pubmed id, a text
// string and evidence, but no GO identifier field.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneOntologyTermMissingGOID)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
    feat->SetExt().SetType().SetStr("GeneOntology");

    // Leaf fields of the term.
    CRef<CUser_field> pmid_field(new CUser_field());
    pmid_field->SetLabel().SetStr("pubmed id");
    pmid_field->SetData().SetInt(4);

    CRef<CUser_field> text_field(new CUser_field());
    text_field->SetLabel().SetStr("text string");
    text_field->SetData().SetStr("something");

    CRef<CUser_field> evidence_field(new CUser_field());
    evidence_field->SetLabel().SetStr("evidence");
    evidence_field->SetData().SetStr("some evidence");

    // The term holds the three fields in the order pmid, text, evidence.
    CRef<CUser_field> go_term(new CUser_field());
    go_term->SetLabel().SetStr("a go term");
    go_term->SetData().SetFields().push_back(pmid_field);
    go_term->SetData().SetFields().push_back(text_field);
    go_term->SetData().SetFields().push_back(evidence_field);

    // The "Process" list wraps the single term and goes into the user object.
    CRef<CUser_field> go_list(new CUser_field());
    go_list->SetLabel().SetStr("Process");
    go_list->SetData().SetFields().push_back(go_term);
    feat->SetExt().SetData().push_back(go_list);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "GeneOntologyTermMissingGOID",
        "GO term does not have GO identifier"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18365 
18366 
18367 // note - this test also covers PseudoRnaViaGeneHasProduct
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoRnaHasProduct)
{
    // Four phases on a pseudo rRNA that has a product:
    //   1. pseudo rRNA with product           -> PseudoRnaHasProduct
    //   2. plus "transcribed pseudogene"      -> nothing expected
    //   3. exception removed, pseudo gene added -> PseudoRnaHasProduct
    //   4. rRNA itself no longer pseudo       -> PseudoRnaViaGeneHasProduct
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> rna = unit_test_util::AddMiscFeature(entry);
    rna->ResetComment();
    rna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
    rna->SetPseudo(true);
    rna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");

    STANDARD_SETUP

    // Phase 1
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "PseudoRnaHasProduct",
        "A pseudo RNA should not have a product"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Phase 2: this exception should turn off the warning
    rna->SetExcept(true);
    rna->SetExcept_text("transcribed pseudogene");
    CLEAR_ERRORS
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Phase 3: should get error if overlapping gene is pseudo
    // (and there is no exception text)
    scope.RemoveTopLevelSeqEntry(seh);
    rna->ResetExcept();
    rna->ResetExcept_text();
    CRef<CSeq_feat> pseudo_gene = unit_test_util::MakeGeneForFeature(rna);
    pseudo_gene->SetPseudo(true);
    unit_test_util::AddFeat(pseudo_gene, entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "PseudoRnaHasProduct",
        "A pseudo RNA should not have a product"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Phase 4: now get PseudoRnaViaGeneHasProduct when rna is not pseudo itself
    rna->ResetPseudo();
    CLEAR_ERRORS
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "PseudoRnaViaGeneHasProduct",
        "An RNA overlapped by a pseudogene should not have a product"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18418 
18419 
// note - same pseudo rRNA setup as Test_SEQ_FEAT_PseudoRnaHasProduct, but on a RefSeq (NC_) sequence, where no errors are expected
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_VR_803)
{
    // A pseudo rRNA with a product is placed on a sequence whose id is
    // NC_000001.1.  expected_errors stays empty for all three validations.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_id> refseq_id(new CSeq_id("NC_000001.1"));
    unit_test_util::ChangeId(entry, refseq_id);

    CRef<CSeq_feat> rna = unit_test_util::AddMiscFeature(entry);
    rna->ResetComment();
    rna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
    rna->SetPseudo(true);
    rna->SetProduct().SetWhole().SetGenbank().SetAccession("AY123456");

    STANDARD_SETUP

    //AddChromosomeNoLocation(expected_errors, entry);

    // no error expected because RefSeq
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // still nothing expected once an overlapping pseudo gene is added
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_feat> pseudo_gene = unit_test_util::MakeGeneForFeature(rna);
    pseudo_gene->SetPseudo(true);
    unit_test_util::AddFeat(pseudo_gene, entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // and still nothing expected when the rna is no longer pseudo itself
    rna->ResetPseudo();
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18457 
18458 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadRRNAcomponentOrder)18459 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadRRNAcomponentOrder)
18460 {
18461     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
18462     CRef<CSeq_feat> r1(new CSeq_feat());
18463     r1->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18464     r1->SetData().SetRna().SetExt().SetName("26S ribosomal RNA");
18465     r1->SetLocation().SetInt().SetId().Assign(*(entry->SetSeq().SetId().front()));
18466     r1->SetLocation().SetInt().SetFrom(0);
18467     r1->SetLocation().SetInt().SetTo(10);
18468     unit_test_util::AddFeat(r1, entry);
18469     CRef<CSeq_feat> r2(new CSeq_feat());
18470     r2->SetData().SetRna().SetType(CRNA_ref::eType_miscRNA);
18471     r2->SetData().SetRna().SetExt().SetName("internal transcribed spacer 2");
18472     r2->SetLocation().SetInt().SetId().Assign(*(entry->SetSeq().SetId().front()));
18473     r2->SetLocation().SetInt().SetFrom(11);
18474     r2->SetLocation().SetInt().SetTo(20);
18475     unit_test_util::AddFeat(r2, entry);
18476     CRef<CSeq_feat> r3(new CSeq_feat());
18477     r3->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
18478     r3->SetData().SetRna().SetExt().SetName("16S ribosomal RNA");
18479     r3->SetLocation().SetInt().SetId().Assign(*(entry->SetSeq().SetId().front()));
18480     r3->SetLocation().SetInt().SetFrom(21);
18481     r3->SetLocation().SetInt().SetTo(30);
18482     unit_test_util::AddFeat(r3, entry);
18483 
18484 
18485     STANDARD_SETUP
18486 
18487     expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "BadRRNAcomponentOrder",
18488                                 "Problem with order of abutting rRNA components"));
18489     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadRRNAcomponentOrder",
18490         "Problem with order of abutting rRNA components"));
18491     //AddChromosomeNoLocation(expected_errors, entry);
18492     eval = validator.Validate(seh, options);
18493     CheckErrors (*eval, expected_errors);
18494 
18495     scope.RemoveTopLevelSeqEntry(seh);
18496     unit_test_util::RevComp(entry);
18497     seh = scope.AddTopLevelSeqEntry(*entry);
18498 
18499     eval = validator.Validate(seh, options);
18500     CheckErrors (*eval, expected_errors);
18501 
18502     CLEAR_ERRORS
18503 
18504     // no errors if organelle
18505     SetGenome(entry, CBioSource::eGenome_chloroplast);
18506     //AddChromosomeNoLocation(expected_errors, entry);
18507     eval = validator.Validate(seh, options);
18508     CheckErrors (*eval, expected_errors);
18509 
18510     CLEAR_ERRORS
18511 }
18512 
18513 
// Checks MissingGeneLocusTag: on a sequence with accession NC_123456, gene
// "a" carries locus tag "tag1" while gene "b" (20-30) has no locus tag.
// One warning is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MissingGeneLocusTag)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_123456");

    // Gene with a locus tag.
    CRef<CSeq_feat> tagged_gene = unit_test_util::AddMiscFeature(entry);
    tagged_gene->ResetComment();
    CGene_ref& tagged_ref = tagged_gene->SetData().SetGene();
    tagged_ref.SetLocus("a");
    tagged_ref.SetLocus_tag("tag1");

    // Gene without one, placed at 20-30.
    CRef<CSeq_feat> untagged_gene = unit_test_util::AddMiscFeature(entry);
    untagged_gene->ResetComment();
    untagged_gene->SetData().SetGene().SetLocus("b");
    CSeq_interval& untagged_int = untagged_gene->SetLocation().SetInt();
    untagged_int.SetFrom(20);
    untagged_int.SetTo(30);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "ref|NC_123456|", eDiag_Warning, "MissingGeneLocusTag",
        "Missing gene locus tag"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18538 
18539 
// Checks MultipleProtRefs: a second protein feature extending to the last
// residue is added to the protein of a good nuc-prot set.  Expected:
// MultipleProtRefs, DuplicateFeat and two ExtraProteinFeature errors.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleProtRefs)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> prot_seq = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);

    CRef<CSeq_feat> second_prot = unit_test_util::AddMiscFeature(prot_seq);
    second_prot->SetData().SetProt().SetName().push_back("a second protein name");
    const TSeqPos last_residue = prot_seq->GetSeq().GetInst().GetLength() - 1;
    second_prot->SetLocation().SetInt().SetTo(last_residue);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "MultipleProtRefs",
        "2 full-length protein features present on protein"));
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "DuplicateFeat",
        "Features have identical intervals, but labels differ"));
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Error, "ExtraProteinFeature",
        "Protein sequence has multiple unprocessed protein features"));
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Error, "ExtraProteinFeature",
        "Protein sequence has multiple unprocessed protein features"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18563 
18564 
// Checks BadInternalCharacter for four kinds of names: protein name and mRNA
// name containing '~', gene locus containing '?', rRNA name containing '!'.
// One warning per name is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadInternalCharacter)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // Protein name
    CRef<CSeq_feat> prot = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    CProt_ref& prot_ref = prot->SetData().SetProt();
    prot_ref.ResetName();
    prot_ref.SetName().push_back("name~something");

    // mRNA (built from the CDS) and its gene
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
    mrna->SetData().SetRna().SetExt().SetName("name~something");
    unit_test_util::AddFeat(mrna, nuc);

    CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature(mrna);
    gene->SetData().SetGene().SetLocus("gene?something");
    unit_test_util::AddFeat(gene, nuc);

    // rRNA
    CRef<CSeq_feat> rrna = unit_test_util::AddMiscFeature(nuc);
    CRNA_ref& rrna_ref = rrna->SetData().SetRna();
    rrna_ref.SetType(CRNA_ref::eType_rRNA);
    rrna_ref.SetExt().SetName("rna!something");

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadInternalCharacter",
        "mRNA name contains undesired character"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadInternalCharacter",
        "Gene locus contains undesired character"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadInternalCharacter",
        "rRNA name contains undesired character"));
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "BadInternalCharacter",
        "Protein name contains undesired character"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18600 
18601 
// A gene locus containing '|' ("gene|synonym") is expected to produce a
// BadInternalCharacter warning.
BOOST_AUTO_TEST_CASE(Test_VR_746)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    CRef<CSeq_feat> piped_gene = unit_test_util::MakeGeneForFeature(cds);
    piped_gene->SetData().SetGene().SetLocus("gene|synonym");
    unit_test_util::AddFeat(piped_gene, nuc);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadInternalCharacter",
        "Gene locus contains undesired character"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18621 
18622 
// Checks BadTrailingCharacter for four kinds of names, each ending in a
// different character: protein name ',', mRNA name '_', gene locus ';',
// rRNA name ':'.  One warning per name is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrailingCharacter)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // Protein name
    CRef<CSeq_feat> prot = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    CProt_ref& prot_ref = prot->SetData().SetProt();
    prot_ref.ResetName();
    prot_ref.SetName().push_back("name something,");

    // mRNA (built from the CDS) and its gene
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
    mrna->SetData().SetRna().SetExt().SetName("name something_");
    unit_test_util::AddFeat(mrna, nuc);

    CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature(mrna);
    gene->SetData().SetGene().SetLocus("gene something;");
    unit_test_util::AddFeat(gene, nuc);

    // rRNA
    CRef<CSeq_feat> rrna = unit_test_util::AddMiscFeature(nuc);
    CRNA_ref& rrna_ref = rrna->SetData().SetRna();
    rrna_ref.SetType(CRNA_ref::eType_rRNA);
    rrna_ref.SetExt().SetName("rna something:");

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadTrailingCharacter",
        "mRNA name ends with undesired character"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadTrailingCharacter",
        "Gene locus ends with undesired character"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadTrailingCharacter",
        "rRNA name ends with undesired character"));
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "BadTrailingCharacter",
        "Protein name ends with undesired character"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18658 
18659 
// Checks BadTrailingHyphen: protein name, mRNA name, gene locus and rRNA
// name each end in '-'.  One warning per name is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadTrailingHyphen)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // Protein name
    CRef<CSeq_feat> prot = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    CProt_ref& prot_ref = prot->SetData().SetProt();
    prot_ref.ResetName();
    prot_ref.SetName().push_back("name something-");

    // mRNA (built from the CDS) and its gene
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
    mrna->SetData().SetRna().SetExt().SetName("name something-");
    unit_test_util::AddFeat(mrna, nuc);

    CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature(mrna);
    gene->SetData().SetGene().SetLocus("gene something-");
    unit_test_util::AddFeat(gene, nuc);

    // rRNA
    CRef<CSeq_feat> rrna = unit_test_util::AddMiscFeature(nuc);
    CRNA_ref& rrna_ref = rrna->SetData().SetRna();
    rrna_ref.SetType(CRNA_ref::eType_rRNA);
    rrna_ref.SetExt().SetName("rna something-");

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadTrailingHyphen",
        "mRNA name ends with hyphen"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadTrailingHyphen",
        "Gene locus ends with hyphen"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "BadTrailingHyphen",
        "rRNA name ends with hyphen"));
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "BadTrailingHyphen",
        "Protein name ends with hyphen"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18695 
18696 
// Checks MultipleGeneOverlap.  Gene "a" spans the whole sequence.  With only
// genes "b" and "c" present as well, nothing is expected; after genes "d",
// "e" and "f" are added, the warning "Gene contains 5 other genes" is
// expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_MultipleGeneOverlap)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    // Several genes end at the last position of the sequence.
    const TSeqPos last_pos = entry->GetSeq().GetInst().GetLength() - 1;

    CRef<CSeq_feat> gene_a = unit_test_util::AddMiscFeature(entry);
    gene_a->SetData().SetGene().SetLocus("a");
    gene_a->SetLocation().SetInt().SetFrom(0);
    gene_a->SetLocation().SetInt().SetTo(last_pos);

    // Gene "b" keeps the location assigned by AddMiscFeature.
    CRef<CSeq_feat> gene_b = unit_test_util::AddMiscFeature(entry);
    gene_b->SetData().SetGene().SetLocus("b");

    CRef<CSeq_feat> gene_c = unit_test_util::AddMiscFeature(entry);
    gene_c->SetData().SetGene().SetLocus("c");
    gene_c->SetLocation().SetInt().SetFrom(11);
    gene_c->SetLocation().SetInt().SetTo(last_pos);

    STANDARD_SETUP
    // no error while the enclosing gene has only two other genes
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Add three more genes, all ending at the last position.
    scope.RemoveTopLevelSeqEntry(seh);

    CRef<CSeq_feat> gene_d = unit_test_util::AddMiscFeature(entry);
    gene_d->SetData().SetGene().SetLocus("d");
    gene_d->SetLocation().SetInt().SetFrom(15);
    gene_d->SetLocation().SetInt().SetTo(last_pos);

    CRef<CSeq_feat> gene_e = unit_test_util::AddMiscFeature(entry);
    gene_e->SetData().SetGene().SetLocus("e");
    gene_e->SetLocation().SetInt().SetFrom(20);
    gene_e->SetLocation().SetInt().SetTo(last_pos);

    CRef<CSeq_feat> gene_f = unit_test_util::AddMiscFeature(entry);
    gene_f->SetData().SetGene().SetLocus("f");
    gene_f->SetLocation().SetInt().SetFrom(25);
    gene_f->SetLocation().SetInt().SetTo(last_pos);

    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "MultipleGeneOverlap",
        "Gene contains 5 other genes"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18739 
18740 
// Checks BadCharInAuthorLastName: a pub descriptor is added whose article
// has a single author with last name "Gr@nt".
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadCharInAuthorLastName)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Article with one author and one title.
    CRef<CAuthor> bad_author(new CAuthor());
    bad_author->SetName().SetName().SetLast("Gr@nt");

    CRef<CPub> article_pub(new CPub());
    article_pub->SetArticle().SetAuthors().SetNames().SetStd().push_back(bad_author);

    CRef<CCit_art::TTitle::C_E> title_item(new CCit_art::TTitle::C_E());
    title_item->SetName("article title");
    article_pub->SetArticle().SetTitle().Set().push_back(title_item);

    // Wrap the article in a pub descriptor on the entry.
    CRef<CSeqdesc> pub_desc(new CSeqdesc());
    pub_desc->SetPub().SetPub().Set().push_back(article_pub);
    entry->SetDescr().Set().push_back(pub_desc);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "BadCharInAuthorLastName",
        "Bad characters in author Gr@nt"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18764 
// Checks PseudoCDSmRNArange with a pseudo CDS on a mix location and a pseudo
// mRNA derived from it.  First the end of the mRNA's first interval is moved
// to 16, then the end of its last interval to 55; each step expects
// CDSmRNAMismatchLocation plus a PseudoCDSmRNArange message.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PseudoCDSmRNArange)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> cds = unit_test_util::AddMiscFeature(entry);
    cds->ResetComment();
    cds->SetData().SetCdregion();
    cds->SetPseudo(true);
    cds->SetLocation().Assign(*unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front()));

    CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
    CSeq_loc_mix::Tdata& mrna_parts = mrna->SetLocation().SetMix().Set();
    mrna_parts.front()->SetInt().SetTo(16);
    unit_test_util::AddFeat(mrna, entry);
    mrna->SetPseudo(true);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "CDSmRNAMismatchLocation",
        "No CDS location match for 1 mRNA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "PseudoCDSmRNArange",
        "mRNA contains CDS but internal intron-exon boundaries do not match"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Change the end of the last mRNA interval; only the second message differs.
    scope.RemoveTopLevelSeqEntry(seh);
    mrna_parts.back()->SetInt().SetTo(55);
    seh = scope.AddTopLevelSeqEntry(*entry);
    expected_errors[1]->SetErrMsg("mRNA overlaps or contains CDS but does not completely contain intervals");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18795 
// GeneXrefNeeded: a CDS overlapped by two genes that share a locus but differ
// in allele, where the CDS carries no gene cross-reference.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefNeeded)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc_entry = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // Add a second CDS/protein pair and pick up the CDS that was appended last.
    AddCDSAndProtForBigGoodNucProtSet(entry, "nuc", "prot2", 30);
    CRef<CSeq_feat> target_cds = entry->SetSet().SetAnnot().front()->SetData().SetFtable().back();

    // First gene: allele "x", start moved three positions earlier.
    CRef<CSeq_feat> gene_x = unit_test_util::MakeGeneForFeature(target_cds);
    gene_x->SetLocation().SetInt().SetFrom(gene_x->GetLocation().GetInt().GetFrom() - 3);
    gene_x->SetData().SetGene().SetLocus("a1");
    gene_x->SetData().SetGene().SetAllele("x");
    unit_test_util::AddFeat(gene_x, nuc_entry);

    // Second gene: allele "y", end moved three positions later.
    CRef<CSeq_feat> gene_y = unit_test_util::MakeGeneForFeature(target_cds);
    gene_y->SetLocation().SetInt().SetTo(gene_y->GetLocation().GetInt().GetTo() + 3);
    gene_y->SetData().SetGene().SetLocus("a1");
    gene_y->SetData().SetGene().SetAllele("y");
    unit_test_util::AddFeat(gene_y, nuc_entry);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "GeneXrefNeeded",
        "Feature overlapped by 2 identical-length equivalent genes but has no cross-reference"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18822 
18823 
// RubiscoProblem: the protein name is replaced with "ribulose bisphosphate"
// and validation is run with the rubisco test option switched on.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RubiscoProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Swap the last protein name for the nonstandard rubisco name.
    CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    prot_feat->SetData().SetProt().SetName().pop_back();
    prot_feat->SetData().SetProt().SetName().push_back("ribulose bisphosphate");

    STANDARD_SETUP
    // The rubisco check is only performed when explicitly requested.
    options |= CValidator::eVal_do_rubisco_test;
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "RubiscoProblem",
        "Nonstandard ribulose bisphosphate protein name"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18840 
18841 
// Unnecessary-exception warnings: the mRNA and CDS of a gen-prod set carry
// "product replaced" exceptions, and the genomic sequence is then replaced.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_UnqualifiedException)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodGenProdSet();

    // Flag the mRNA with a "transcribed product replaced" exception.
    CRef<CSeq_feat> mrna_feat = unit_test_util::GetmRNAFromGenProdSet(entry);
    mrna_feat->SetExcept(true);
    mrna_feat->SetExcept_text("transcribed product replaced");

    // Flag the CDS with a "translated product replaced" exception.
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGenProdSet(entry);
    cds_feat->SetExcept(true);
    cds_feat->SetExcept_text("translated product replaced");

    // Replace the genomic nucleotide sequence.
    CRef<CSeq_entry> genomic_entry = unit_test_util::GetGenomicFromGenProdSet(entry);
    genomic_entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set(
        "ATGGGGAGAAAAACAGAGATAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "UnnecessaryException",
        "CDS has unnecessary translated product replaced exception"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "mRNAUnnecessaryException",
        "mRNA has transcribed product replaced exception"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18865 
18866 
// ProteinNameHasPMID: the first protein name is overwritten with "(PMID 1234)".
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ProteinNameHasPMID)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    CRef<CSeq_feat> prot_feat = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
    prot_feat->SetData().SetProt().SetName().front().assign("(PMID 1234)");

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "ProteinNameHasPMID",
        "Protein name has internal PMID"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18881 
18882 
// Walks a "GeneOntology" user-object extension on a misc feature through a
// series of malformed states. Each step repairs the previously reported
// problem and introduces the next one, re-validating after every change and
// updating the text of expected_errors[0] to match.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadGeneOntologyFormat)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
    feat->SetExt().SetType().SetStr("GeneOntology");
    // Step 1: the top-level GO list holds a plain string instead of fields.
    CRef<CUser_field> go_list(new CUser_field());
    go_list->SetData().SetStr("something");
    feat->SetExt().SetData().push_back(go_list);

    STANDARD_SETUP
    expected_errors.push_back (new CExpectedError("lcl|good", eDiag_Warning, "BadGeneOntologyFormat",
                              "Bad data format for GO term"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 2: the list now holds a (still empty) term, but the list has no label.
    CRef<CUser_field> go_term (new CUser_field());
    go_list->SetData().SetFields().push_back (go_term);
    expected_errors[0]->SetErrMsg("Unrecognized GO term label [blank]");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 3: the list label is set, but to a value that is not recognized.
    go_list->SetLabel().SetStr("something");
    expected_errors[0]->SetErrMsg("Unrecognized GO term label something");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 4: the list label "Process" is accepted; the empty term is now reported.
    go_list->SetLabel().SetStr("Process");
    expected_errors[0]->SetErrMsg("Bad GO term format");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 5: the term gets one qualifier field with no label; the missing
    // GO identifier is reported as a second error from here on.
    CRef<CUser_field> go_field(new CUser_field());
    go_term->SetData().SetFields().push_back(go_field);
    expected_errors[0]->SetErrMsg("Unrecognized label on GO term qualifier field [blank]");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GeneOntologyTermMissingGOID",
                              "GO term does not have GO identifier"));
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 6: the qualifier label is set, but to a value that is not recognized.
    go_field->SetLabel().SetStr("notlabel");
    expected_errors[0]->SetErrMsg("Unrecognized label on GO term qualifier field notlabel");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 7: the qualifier is labelled "go id" but no data has been set on it.
    go_field->SetLabel().SetStr("go id");
    expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier GO ID");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 8: the GO id now has integer data, so the missing-GOID expectation
    // is dropped; a "text string" qualifier with integer data is the new problem.
    go_field->SetData().SetInt(123);
    CRef<CUser_field> go_field2(new CUser_field());
    go_field2->SetLabel().SetStr("text string");
    go_field2->SetData().SetInt(123);
    go_term->SetData().SetFields().push_back(go_field2);
    expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier term");
    delete expected_errors[1];
    expected_errors.pop_back();
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 9: the text string is fixed; a "pubmed id" qualifier with string
    // data is the new problem.
    go_field2->SetData().SetStr("some text");
    CRef<CUser_field> go_field3(new CUser_field());
    go_field3->SetLabel().SetStr("pubmed id");
    go_field3->SetData().SetStr("some text");
    go_term->SetData().SetFields().push_back(go_field3);
    expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier PMID");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Step 10: the pubmed id is fixed; an "evidence" qualifier with integer
    // data is the new problem.
    go_field3->SetData().SetInt(123);
    CRef<CUser_field> go_field4(new CUser_field());
    go_field4->SetLabel().SetStr("evidence");
    go_field4->SetData().SetInt(123);
    go_term->SetData().SetFields().push_back(go_field4);
    expected_errors[0]->SetErrMsg("Bad data format for GO term qualifier evidence");
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
18964 
18965 
// InconsistentGeneOntologyTermAndId: two process GO terms built with different
// arguments are attached to one feature, and a single warning about GO ID 123
// is expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InconsistentGeneOntologyTermAndId)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> go_feat = unit_test_util::AddMiscFeature(entry);

    CRef<CUser_field> first_term = MakeGoTerm("a1", "evidence 1");
    AddProcessGoTerm(*go_feat, first_term);

    CRef<CUser_field> second_term = MakeGoTerm("a2", "evidence 2");
    AddProcessGoTerm(*go_feat, second_term);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "InconsistentGeneOntologyTermAndId",
        "Inconsistent GO terms for GO ID 123"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
18984 
18985 
// DuplicateGeneConflictingLocusTag: two gene features created the same way,
// first with identical loci and then with loci differing only in case.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_DuplicateGeneConflictingLocusTag)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> first_gene = unit_test_util::AddMiscFeature(entry);
    first_gene->SetData().SetGene().SetLocus("gene1");

    CRef<CSeq_feat> second_gene = unit_test_util::AddMiscFeature(entry);
    second_gene->SetData().SetGene().SetLocus("gene1");

    STANDARD_SETUP

    // Case 1: both loci are exactly "gene1".
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "FeatContentDup",
        "Duplicate feature"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "DuplicateGeneConflictingLocusTag",
        "Colliding names in gene features, but feature locations are identical"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Case 2: "gene1" versus "GENE1".
    second_gene->SetData().SetGene().SetLocus("GENE1");
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "DuplicateFeat",
        "Features have identical intervals, but labels differ"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "DuplicateGeneConflictingLocusTag",
        "Colliding names (with different capitalization) in gene features, but feature locations are identical"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19017 
19018 
// ReplicatedGeneSequence: two genes with the same locus (and then loci that
// differ only in case), the second one starting at position 30. No errors
// are expected in either configuration (VR-801).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ReplicatedGeneSequence)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> first_gene = unit_test_util::AddMiscFeature(entry);
    first_gene->SetData().SetGene().SetLocus("gene1");

    // The second gene starts at 30 and ends at 30 plus the first gene's end.
    CRef<CSeq_feat> shifted_gene = unit_test_util::AddMiscFeature(entry);
    shifted_gene->SetData().SetGene().SetLocus("gene1");
    shifted_gene->SetLocation().SetInt().SetFrom(30);
    shifted_gene->SetLocation().SetInt().SetTo(30 + first_gene->GetLocation().GetInt().GetTo());

    STANDARD_SETUP
    //AddChromosomeNoLocation(expected_errors, entry);
    // error no longer expected, VR-801
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Same check with the loci differing only in capitalization.
    shifted_gene->SetData().SetGene().SetLocus("GENE1");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19041 
19042 
// GeneXrefStrandProblem: a feature cross-references a gene that was built for
// it but put on the minus strand. The same warning is expected when matching
// by locus and by locus_tag, both before and after reverse-complementing.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_GeneXrefStrandProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> xref_feat = unit_test_util::AddMiscFeature(entry);
    xref_feat->SetGeneXref().SetLocus("gene locus");

    CRef<CSeq_feat> minus_gene = unit_test_util::MakeGeneForFeature(xref_feat);
    minus_gene->SetData().SetGene().SetLocus("gene locus");
    minus_gene->SetLocation().SetInt().SetStrand(eNa_strand_minus);
    unit_test_util::AddFeat(minus_gene, entry);

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "GeneXrefStrandProblem",
        "Gene cross-reference is not on expected strand"));
    //AddChromosomeNoLocation(expected_errors, entry);

    // Locus-based xref, original orientation.
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Locus-based xref after reverse-complementing the entry.
    scope.RemoveTopLevelSeqEntry(seh);
    unit_test_util::RevComp(entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Switch both the xref and the gene from locus to locus_tag.
    xref_feat->SetGeneXref().ResetLocus();
    xref_feat->SetGeneXref().SetLocus_tag("LOCUSTAG");
    minus_gene->SetData().SetGene().ResetLocus();
    minus_gene->SetData().SetGene().SetLocus_tag("LOCUSTAG");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Locus_tag-based xref after reverse-complementing once more.
    scope.RemoveTopLevelSeqEntry(seh);
    unit_test_util::RevComp(entry);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19081 
19082 
// CDSmRNAXrefLocationProblem: a CDS and an mRNA cross-reference each other by
// local feature id, but the mRNA ends one position short.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_CDSmRNAXrefLocationProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc_entry = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    // CDS has local id 1 and points at feature id 2.
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds_feat->SetId().SetLocal().SetId(1);
    CRef<CSeqFeatXref> cds_to_mrna(new CSeqFeatXref());
    cds_to_mrna->SetId().SetLocal().SetId(2);
    cds_feat->SetXref().push_back(cds_to_mrna);

    // mRNA has local id 2 and points back at feature id 1.
    CRef<CSeq_feat> mrna_feat = unit_test_util::MakemRNAForCDS(cds_feat);
    mrna_feat->SetId().SetLocal().SetId(2);
    CRef<CSeqFeatXref> mrna_to_cds(new CSeqFeatXref());
    mrna_to_cds->SetId().SetLocal().SetId(1);
    mrna_feat->SetXref().push_back(mrna_to_cds);

    // Shorten the mRNA by one position at its "to" end.
    mrna_feat->SetLocation().SetInt().SetTo(mrna_feat->GetLocation().GetInt().GetTo() - 1);
    unit_test_util::AddFeat(mrna_feat, nuc_entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "CDSmRNAXrefLocationProblem",
        "CDS not contained within cross-referenced mRNA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "CDSmRNArange",
        "mRNA overlaps or contains CDS but does not completely contain intervals"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
19112 
19113 
// IdenticalGeneSymbolAndSynonym: one gene's synonym equals the locus of a
// different gene on the same sequence.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_IdenticalGeneSymbolAndSynonym)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Gene with locus "gene1" on positions 0..3.
    CRef<CSeq_feat> locus_gene(new CSeq_feat());
    locus_gene->SetData().SetGene().SetLocus("gene1");
    locus_gene->SetLocation().SetInt().SetFrom(0);
    locus_gene->SetLocation().SetInt().SetTo(3);
    locus_gene->SetLocation().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
    unit_test_util::AddFeat(locus_gene, entry);

    // Gene with locus "gene2" and synonym "gene1" on positions 4..end.
    CRef<CSeq_feat> synonym_gene(new CSeq_feat());
    synonym_gene->SetData().SetGene().SetLocus("gene2");
    synonym_gene->SetData().SetGene().SetSyn().push_back("gene1");
    synonym_gene->SetLocation().SetInt().SetFrom(4);
    synonym_gene->SetLocation().SetInt().SetTo(entry->GetSeq().GetLength() - 1);
    synonym_gene->SetLocation().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
    unit_test_util::AddFeat(synonym_gene, entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "IdenticalGeneSymbolAndSynonym",
        "gene synonym has same value (gene1) as locus of another gene feature"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
19144 
19145 
// PartialProblem family: the coding region is shortened and made 5' partial
// while the protein is marked as no-right (3' partial), so the partialness of
// CDS and protein disagree. A second pass sets the CDS partial flag and
// expects the PartialsInconsistent warning to disappear.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_PartialProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    // The protein is the last member of the nuc-prot set.
    CRef<CSeq_entry> prot = entry->SetSet().SetSeq_set().back();
    CRef<CSeq_feat> cds_feat = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // make coding region shorter, 5' partial
    cds_feat->SetLocation().SetInt().SetFrom(3);
    cds_feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
    // shorten protein sequence
    prot->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("PRKTEIN");
    prot->SetSeq().SetInst().SetLength(7);
    unit_test_util::AdjustProtFeatForNucProtSet (entry);
    // make protein sequence 3' partial
    unit_test_util::SetCompleteness (prot, CMolInfo::eCompleteness_no_right);


    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistentCDSProtein",
                                                 "Coding region and protein feature partials conflict"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus5Prime",
                              "5' partial is not at beginning of sequence, gap, or consensus splice site"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialsInconsistent",
                              "Inconsistent: Product= partial, Location= partial, Feature.partial= FALSE"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop", "Got stop codon, but 3'end is labeled partial"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem", "CDS is 3' complete but protein is CO2 partial"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblem", "CDS is 5' partial but protein is CO2 partial"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // set partial on CDS, third error should go away
    cds_feat->SetPartial (true);
    // The slot is freed and left as NULL rather than erased, so the remaining
    // expectations keep their positions in the list.
    delete expected_errors[2];
    expected_errors[2] = NULL;
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
19189 
19190 
// ProteinNameEndsInBracket: product names ending in ']' are reported, with or
// without an opening bracket, except when the bracketed text starts with "[NAD".
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ProteinNameEndsInBracket)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Name with a complete trailing bracket pair.
    unit_test_util::SetNucProtSetProductName(entry, "something [ends with bracket]");

    STANDARD_SETUP
    expected_errors.push_back(new CExpectedError(
        "lcl|prot", eDiag_Warning, "ProteinNameEndsInBracket",
        "Protein name ends with bracket and may contain organism name"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // report if no beginning bracket
    unit_test_util::SetNucProtSetProductName(entry, "something NAD with bracket]");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // no report if [NAD
    unit_test_util::SetNucProtSetProductName(entry, "something [NAD with bracket]");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19218 
19219 
// ShortIntron: a gene (0..59), a two-interval coding region (0..15 and 19..59)
// and an explicit intron feature (16..18) are placed on one sequence, so the
// gap between the CDS intervals and the intron feature are both 3 bases long.
// The test then toggles the pseudo flag on the CDS, the intron and the gene in
// turn, and finally replaces the sequence, checking after each change which of
// the ShortIntron / splice-consensus / CDS messages are still reported.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ShortIntron)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_id> id = entry->SetSeq().SetId().front();

    // add gene
    CRef<CSeq_feat> gene (new CSeq_feat());
    gene->SetData().SetGene().SetLocus("locus");
    gene->SetLocation().SetInt().SetFrom(0);
    gene->SetLocation().SetInt().SetTo(59);
    gene->SetLocation().SetInt().SetId().Assign(*id);
    unit_test_util::AddFeat(gene, entry);

    // add coding region
    CRef<CSeq_feat> cds (new CSeq_feat());
    cds->SetData().SetCdregion();

    // first CDS interval: 0..15
    CRef<CSeq_loc> loc1(new CSeq_loc());
    loc1->SetInt().SetFrom(0);
    loc1->SetInt().SetTo(15);
    loc1->SetInt().SetId().Assign(*id);

    // second CDS interval: 19..59
    CRef<CSeq_loc> loc2(new CSeq_loc());
    loc2->SetInt().SetFrom(19);
    loc2->SetInt().SetTo(59);
    loc2->SetInt().SetId().Assign(*id);

    cds->SetLocation().SetMix().Set().push_back(loc1);
    cds->SetLocation().SetMix().Set().push_back(loc2);
    unit_test_util::AddFeat(cds, entry);

    // add intron
    CRef<CSeq_feat> intron (new CSeq_feat());
    intron->SetData().SetImp().SetKey("intron");
    intron->SetLocation().SetInt().SetFrom(16);
    intron->SetLocation().SetInt().SetTo(18);
    intron->SetLocation().SetInt().SetId().Assign(*id);
    unit_test_util::AddFeat(intron, entry);

    STANDARD_SETUP

    // The standalone helpers are expected to agree with the StartCodon and
    // NoStop errors listed below.
    BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
    BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);

    // Phase 1: nothing is pseudo. Two ShortIntron warnings are expected: one
    // without positions and one naming positions 16-20.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
                              "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
                              "Missing stop codon"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found before exon starting at position 20 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingCDSproduct",
                              "Expected CDS product absent"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
                              "Introns should be at least 10 nt long"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
                              "Introns at positions 16-20 should be at least 10 nt long"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoProtein", "No protein Bioseq given"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Phase 2: set CDS pseudo, one ShortIntron error should go away
    // (the one naming positions 16-20); only the "at start/end of intron"
    // messages and the position-less ShortIntron remain expected.
    cds->SetPseudo(true);
    CLEAR_ERRORS
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
                              "Introns should be at least 10 nt long"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
    //AddChromosomeNoLocation(expected_errors, entry);


    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Phase 3: make cds not pseudo, intron pseudo, should still get one
    // ShortIntron error (this time the one naming positions 16-20)
    cds->ResetPseudo();
    intron->SetPseudo(true);

    BOOST_CHECK_EQUAL(validator::HasBadStartCodon(*cds, scope, false), true);
    BOOST_CHECK_EQUAL(validator::HasNoStop(*cds, &scope), true);

    CLEAR_ERRORS
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
                              "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
                              "Missing stop codon"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found before exon starting at position 20 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingCDSproduct",
                              "Expected CDS product absent"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
                              "Introns at positions 16-20 should be at least 10 nt long"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoProtein", "No protein Bioseq given"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // Phase 4: clear both pseudo, make gene pseudo, both ShortIntron errors
    // should go away; only the two intron splice-consensus warnings remain
    intron->ResetPseudo();
    gene->SetPseudo(true);
    CLEAR_ERRORS

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    CLEAR_ERRORS

    // Phase 5: clear all pseudos
    gene->ResetPseudo();
    // nonsense intron silences coding region shortintron message
    // (in the replacement sequence, 0-based positions 16..18 read "TAA")
    entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGTAAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoProtein",
        "No protein Bioseq given"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical, "IntronIsStopCodon",
        "Triplet intron encodes stop codon"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "StartCodon",
                              "Illegal start codon used. Wrong genetic code [0] or protein should be partial"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "NoStop",
                              "Missing stop codon"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "MissingCDSproduct",
                              "Expected CDS product absent"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "ShortIntron",
                              "Introns should be at least 10 nt long"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found after exon ending at position 16 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found before exon starting at position 20 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusDonor",
                              "Splice donor consensus (GT) not found at start of intron, position 17 of lcl|good"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NotSpliceConsensusAcceptor",
                              "Splice acceptor consensus (AG) not found at end of intron, position 19 of lcl|good"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);
    CLEAR_ERRORS
}
19377 
19378 
// A misc_feature whose comment has been reset is expected to produce the
// MiscFeatureNeedsNote warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_NeedsNote)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Add a misc feature and strip its comment.
    CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
    feat->ResetComment();

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Warning,
            "MiscFeatureNeedsNote",
            "A note or other qualifier is required for a misc_feature"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19395 
19396 
// A repeat_region carrying /rpt_unit_range="1..70" is expected to be reported
// with the RptUnitRangeProblem warning (range not within sequence length).
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_RptUnitRangeProblem)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Re-key the added misc feature as a repeat_region import feature.
    CRef<CSeq_feat> repeat = unit_test_util::AddMiscFeature(entry);
    repeat->SetData().SetImp().SetKey("repeat_region");

    // Attach the qualifier under test.
    CRef<CGb_qual> range_qual(new CGb_qual());
    range_qual->SetVal("1..70");
    range_qual->SetQual("rpt_unit_range");
    repeat->SetQual().push_back(range_qual);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Warning,
            "RptUnitRangeProblem",
            "/rpt_unit_range is not within sequence length"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19417 
19418 
// Adds 50 misc features, each with 10 /inference qualifiers listing 10
// accessions apiece (500 qualifiers, 5000 accessions overall), and expects
// the validator to report that it skips validating them.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_TooManyInferenceAccessions)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    for (int feat_idx = 0; feat_idx < 50; ++feat_idx) {
        CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry, feat_idx + 10);

        for (int qual_idx = 0; qual_idx < 10; ++qual_idx) {
            // Comma-separated accession list; the separator goes before
            // every accession except the first.
            string accessions = "similar to DNA sequence:";
            for (int acc_idx = 0; acc_idx < 10; ++acc_idx) {
                if (acc_idx > 0) {
                    accessions += ",";
                }
                accessions += "INSD:AY" + NStr::IntToString(acc_idx + qual_idx * 100 + 123400) + ".1";
            }

            CRef<CGb_qual> inference(new CGb_qual());
            inference->SetQual("inference");
            inference->SetVal(accessions);
            feat->SetQual().push_back(inference);
        }
    }

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Info,
            "TooManyInferenceAccessions",
            "Skipping validation of 500 /inference qualifiers with 5000 accessions"));
    //AddChromosomeNoLocation(expected_errors, entry);

    // Inference accession validation is switched on for this call only.
    eval = validator.Validate(seh, options | CValidator::eVal_inference_accns);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19449 
19450 
BuildSetAlign(CRef<CSeq_entry> entry)19451 static CRef<CSeq_align> BuildSetAlign(CRef<CSeq_entry> entry)
19452 {
19453     CRef<CSeq_align> align(new CSeq_align());
19454     align->SetType(CSeq_align::eType_global);
19455     align->SetSegs().SetDenseg().SetNumseg(1);
19456 
19457     int dim = 0;
19458     int len = 0;
19459 
19460     FOR_EACH_SEQENTRY_ON_SEQSET (s, entry->GetSet()) {
19461         dim++;
19462         CRef<CSeq_id> id(new CSeq_id());
19463         id->Assign(*((*s)->GetSeq().GetId().front()));
19464         align->SetSegs().SetDenseg().SetIds().push_back(id);
19465         align->SetSegs().SetDenseg().SetStarts().push_back(0);
19466 
19467         len = (*s)->GetSeq().GetInst().GetLength();
19468     }
19469     align->SetDim(dim);
19470     align->SetSegs().SetDenseg().SetDim(dim);
19471     align->SetSegs().SetDenseg().SetLens().push_back(len);
19472 
19473     return align;
19474 }
19475 
19476 
// Renames the last alignment row to the local id "good4" and expects a
// SeqIdProblem error saying that sequence could not be found, alongside the
// FastaLike and PercentIdentity warnings.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_SeqIdProblem, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    CRef<CSeq_align> align = BuildSetAlign(entry);
    align->SetSegs().SetDenseg().SetIds().back()->SetLocal().SetStr("good4");

    // Attach the alignment to the set through a new annotation.
    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "FastaLike",
            "Fasta: This may be a fasta-like alignment for SeqId: lcl|good1 in the context of good1"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "SeqIdProblem",
            "SeqId: The sequence corresponding to SeqId lcl|good4 could not be found."));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "PercentIdentity",
            "PercentIdentity: This alignment has a percent identity of 0%"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19501 
19502 
// Hand-builds a two-segment dense-seg alignment in which one second-segment
// start (6) breaks the "0 + 5" progression the other starts follow, and
// expects the DensegLenStart error naming lcl|good2.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_DensegLenStart, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    CRef<CSeq_align> align(new CSeq_align());
    align->SetType(CSeq_align::eType_global);

    auto& denseg = align->SetSegs().SetDenseg();
    denseg.SetNumseg(2);

    // First segment: one row per sequence in the set, all starting at 0.
    int num_rows = 0;
    FOR_EACH_SEQENTRY_ON_SEQSET (it, entry->GetSet()) {
        CRef<CSeq_id> row_id(new CSeq_id());
        row_id->Assign(*((*it)->GetSeq().GetId().front()));
        denseg.SetIds().push_back(row_id);
        denseg.SetStarts().push_back(0);
        ++num_rows;
    }
    align->SetDim(num_rows);
    denseg.SetDim(num_rows);

    // Segment lengths are 5 and 10; second-segment starts are 5, 6, 5.
    denseg.SetLens().push_back(5);
    denseg.SetLens().push_back(10);
    denseg.SetStarts().push_back(5);
    denseg.SetStarts().push_back(6);
    denseg.SetStarts().push_back(5);

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "DensegLenStart",
            "Start/Length: There is a problem with sequence lcl|good2, in segment 1 (near sequence position 0), context good1: the segment is too long or short or the next segment has an incorrect start position"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19543 
19544 
// Appends a second segment (starts of 5, length 60) to the set alignment and
// expects one SumLenStart error each for lcl|good1, lcl|good2 and lcl|good3.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_SumLenStart, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    CRef<CSeq_align> align = BuildSetAlign(entry);

    auto& denseg = align->SetSegs().SetDenseg();
    denseg.SetNumseg(2);

    // Shorten the first segment to 5 and add a second of length 60.
    auto& lens = denseg.SetLens();
    lens[0] = 5;
    lens.push_back(60);

    // Three second-segment starts, all at position 5.
    for (int row = 0; row < 3; ++row) {
        denseg.SetStarts().push_back(5);
    }

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "SumLenStart",
            "Start: In sequence lcl|good1, segment 2 (near sequence position 5) context good1, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "SumLenStart",
            "Start: In sequence lcl|good2, segment 2 (near sequence position 5) context good1, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "SumLenStart",
            "Start: In sequence lcl|good3, segment 2 (near sequence position 5) context good1, the alignment claims to contain residue coordinates that are past the end of the sequence.  Either the sequence is too short, or there are extra characters or formatting errors in the alignment"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19575 
19576 
// Sets the dense-seg dim to 4 without adding matching ids or starts, and
// expects the AlignDimSeqIdNotMatch and SegsStartsMismatch errors.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_AlignDimSeqIdNotMatch, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    CRef<CSeq_align> align = BuildSetAlign(entry);
    align->SetSegs().SetDenseg().SetDim(4);

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    // The diag filter "!(1207.5)" is active only while STANDARD_SETUP runs,
    // then it is reset to the empty filter.
    SetDiagFilter(eDiagFilter_All, "!(1207.5)");
    STANDARD_SETUP
    SetDiagFilter(eDiagFilter_All, "");

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "AlignDimSeqIdNotMatch",
            "SeqId: The Seqalign has more or fewer ids than the number of rows in the alignment (context good1).  Look for possible formatting errors in the ids."));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "SegsStartsMismatch",
            "The number of Starts (3) does not match the expected size of dim * numseg (4)"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19601 
19602 
// Phase 1: with the first sequence of the set reverse-complemented, the set
// alignment is expected to draw the FastaLike and PercentIdentity warnings.
// Phase 2: after adding a gap and a second segment, only PercentIdentity is
// expected.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_FastaLike, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    unit_test_util::RevComp(entry->SetSet().SetSeq_set().front());

    CRef<CSeq_align> align = BuildSetAlign(entry);

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "FastaLike",
            "Fasta: This may be a fasta-like alignment for SeqId: lcl|good1 in the context of good1"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "PercentIdentity",
            "PercentIdentity: This alignment has a percent identity of 0%"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // fasta like error should disappear if there are 5' gaps or internal gaps
    auto& denseg = align->SetSegs().SetDenseg();
    denseg.SetNumseg(2);

    auto& lens = denseg.SetLens();
    lens[0] = 5;
    lens.push_back(55);

    // Starts entry at index 2 becomes a gap (-1); the second segment's
    // three starts are all 5.
    auto& starts = denseg.SetStarts();
    starts[2] = -1;
    for (int row = 0; row < 3; ++row) {
        starts.push_back(5);
    }

    CLEAR_ERRORS
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "PercentIdentity",
            "PercentIdentity: This alignment has a percent identity of 0%"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19643 
19644 
// An alignment whose segs have been reset is expected to produce the
// NullSegs error (reported with an empty accession).
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_NullSegs, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    // Build the usual set alignment, then discard its segments.
    CRef<CSeq_align> align = BuildSetAlign(entry);
    align->ResetSegs();

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "",
            eDiag_Error,
            "NullSegs",
            "Segs: This alignment is missing all segments.  This is a non-correctable error -- look for serious formatting problems."));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19666 
19667 
// Inserts a middle segment whose starts are all -1 and expects the
// SegmentGap error for segment 2.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_SegmentGap, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    CRef<CSeq_align> align = BuildSetAlign(entry);

    auto& denseg = align->SetSegs().SetDenseg();
    denseg.SetNumseg(3);

    // Segment lengths become 5, 10 and 55.
    auto& lens = denseg.SetLens();
    lens[0] = 5;
    lens.push_back(10);
    lens.push_back(55);

    // Second segment: three starts of -1; third segment: three starts of 5.
    auto& starts = denseg.SetStarts();
    for (int row = 0; row < 3; ++row) {
        starts.push_back(-1);
    }
    for (int row = 0; row < 3; ++row) {
        starts.push_back(5);
    }

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "SegmentGap",
            "Segs: Segment 2 (near alignment position 5) in the context of good1 contains only gaps.  Each segment must contain at least one actual sequence -- look for columns with all gaps and delete them."));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19698 
19699 
// Drops the last two ids and starts, sets the dense-seg dim to 1, and
// expects the AlignDimOne error.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_AlignDimOne, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    CRef<CSeq_align> align = BuildSetAlign(entry);

    auto& denseg = align->SetSegs().SetDenseg();
    denseg.SetDim(1);

    // Remove the last two rows from both the id list and the start list.
    for (int removed = 0; removed < 2; ++removed) {
        denseg.SetIds().pop_back();
    }
    for (int removed = 0; removed < 2; ++removed) {
        denseg.SetStarts().pop_back();
    }

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "AlignDimOne",
            "Dim: This seqalign apparently has only one sequence.  Each alignment must have at least two sequences.  context lcl|good1"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19725 
19726 
// Sparse segs, then spliced segs, are each expected to raise the Segtype
// warning; the messages quote segtype 7 and segtype 6 respectively.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_Segtype, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    // Start with an alignment whose segs choice is "sparse".
    CRef<CSeq_align> align(new CSeq_align());
    align->SetSegs().SetSparse();

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "",
            eDiag_Warning,
            "Segtype",
            "Segs: This alignment has an undefined or unsupported Seqalign segtype 7 (alignment number 1)"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Switch the same alignment to "spliced" and re-validate.
    align->SetSegs().SetSpliced();
    expected_errors[0]->SetErrMsg("Segs: This alignment has an undefined or unsupported Seqalign segtype 6 (alignment number 1)");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19753 
19754 
// An alignment annotation carrying a user descriptor of type "Blast Type"
// is expected to produce the BlastAligns error.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_BlastAligns, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(BuildSetAlign(entry));

    // Tag the annotation with the "Blast Type" user descriptor.
    CRef<CAnnotdesc> blast_desc(new CAnnotdesc());
    blast_desc->SetUser().SetType().SetStr("Blast Type");
    annot->SetDesc().Set().push_back(blast_desc);

    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "BlastAligns",
            "Record contains BLAST alignments"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19779 
19780 
// Replaces the first sequence's residues and expects the set alignment to
// be reported with a percent identity of 43% (plus the FastaLike warning).
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_PercentIdentity, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    // Overwrite the IUPACna data of the first sequence in the set.
    auto& first_data = entry->SetSet().SetSeq_set().front()->SetSeq().SetInst().SetSeq_data();
    first_data.SetIupacna().Set("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTGGCCAAAATTGGCCAA");

    CRef<CSeq_align> align = BuildSetAlign(entry);

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(align);
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "FastaLike",
            "Fasta: This may be a fasta-like alignment for SeqId: lcl|good1 in the context of good1"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "PercentIdentity",
            "PercentIdentity: This alignment has a percent identity of 43%"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19805 
19806 
19807 
BuildSetDendiagAlign(CRef<CSeq_entry> entry)19808 static CRef<CSeq_align> BuildSetDendiagAlign(CRef<CSeq_entry> entry)
19809 {
19810     CRef<CSeq_align> align(new CSeq_align());
19811     align->SetType(CSeq_align::eType_global);
19812 
19813     CRef<CDense_diag> diag(new CDense_diag());
19814 
19815 
19816     int dim = 0;
19817     int len = 0;
19818 
19819     FOR_EACH_SEQENTRY_ON_SEQSET (s, entry->GetSet()) {
19820         dim++;
19821         CRef<CSeq_id> id(new CSeq_id());
19822         id->Assign(*((*s)->GetSeq().GetId().front()));
19823         diag->SetIds().push_back(id);
19824         diag->SetStarts().push_back(0);
19825 
19826         len = (*s)->GetSeq().GetInst().GetLength();
19827     }
19828     align->SetDim(dim);
19829     diag->SetDim(dim);
19830     diag->SetLen(len);
19831     align->SetSegs().SetDendiag().push_back(diag);
19832 
19833     return align;
19834 }
19835 
19836 
// A dense-diag alignment on the set is expected to produce the
// UnexpectedAlignmentType error and the PercentIdentity warning.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_ALIGN_UnexpectedAlignmentType, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    // Attach a dense-diag (not dense-seg) alignment to the set.
    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetAlign().push_back(BuildSetDendiagAlign(entry));
    entry->SetSet().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Error,
            "UnexpectedAlignmentType",
            "UnexpectedAlignmentType: This is not a DenseSeg alignment."));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good1",
            eDiag_Warning,
            "PercentIdentity",
            "PercentIdentity: This alignment has a percent identity of 0%"));
    //AddChromosomeNoLocation(expected_errors, entry);

    options |= CValidator::eVal_val_align;
    options |= CValidator::eVal_remote_fetch;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19860 
19861 
BuildGoodByteGraph(CRef<CSeq_entry> entry,TSeqPos offset=0,TSeqPos len=kInvalidSeqPos)19862 static CRef<CSeq_graph> BuildGoodByteGraph(CRef<CSeq_entry> entry, TSeqPos offset = 0, TSeqPos len = kInvalidSeqPos)
19863 {
19864     CRef<CSeq_graph> graph (new CSeq_graph());
19865     graph->SetTitle("Phrap Quality");
19866     if (len == kInvalidSeqPos) {
19867       len = entry->GetSeq().GetInst().GetLength() - offset;
19868     }
19869     graph->SetNumval(len);
19870     graph->SetLoc().SetInt().SetFrom(offset);
19871     graph->SetLoc().SetInt().SetTo(offset + len - 1);
19872     graph->SetLoc().SetInt().SetId().Assign(*(entry->GetSeq().GetId().front()));
19873 
19874     for (size_t pos = 0; pos < len; pos++) {
19875         graph->SetGraph().SetByte().SetValues().push_back(40);
19876     }
19877 
19878 
19879     graph->SetGraph().SetByte().SetMax(40);
19880     graph->SetGraph().SetByte().SetMin(40);
19881     graph->SetGraph().SetByte().SetAxis(40);
19882     return graph;
19883 }
19884 
19885 
// A graph minimum of -1 and then of 101 are both expected to be reported as
// out of range; with 101 the GraphBelow warning is expected as well.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphMin)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
    auto& bytes = graph->SetGraph().SetByte();
    bytes.SetMin(-1);

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetGraph().push_back(graph);
    entry->SetSeq().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Warning,
            "GraphMin",
            "Graph min (-1) out of range"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Raise the minimum to 101 and re-validate.
    bytes.SetMin(101);
    expected_errors[0]->SetErrMsg("Graph min (101) out of range");
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Warning,
            "GraphBelow",
            "60 quality scores have values below the reported minimum or 0"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19912 
19913 
// A graph maximum of -1 is expected to give a GraphMax error plus a
// GraphAbove warning; a maximum of 101 is expected to give only a GraphMax
// warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphMax)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
    auto& bytes = graph->SetGraph().SetByte();
    bytes.SetMax(-1);

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetGraph().push_back(graph);
    entry->SetSeq().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphMax",
            "Graph max (-1) out of range"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Warning,
            "GraphAbove",
            "60 quality scores have values above the reported maximum or 100"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Free the GraphAbove expectation but keep its slot as a NULL entry.
    delete expected_errors[1];
    expected_errors[1] = NULL;

    // Second pass: maximum of 101, now expected at warning severity.
    bytes.SetMax(101);
    expected_errors[0]->SetSeverity(eDiag_Warning);
    expected_errors[0]->SetErrMsg("Graph max (101) out of range");

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19944 
19945 
// Setting numval to 40 on a full-length byte graph is expected to produce
// the GraphByteLen and GraphBioseqLen errors (40 versus 60 in both).
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphByteLen)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Override only numval; the stored values are left as built.
    CRef<CSeq_graph> graph = BuildGoodByteGraph(entry);
    graph->SetNumval(40);

    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetGraph().push_back(graph);
    entry->SetSeq().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphByteLen",
            "SeqGraph (40) and ByteStore (60) length mismatch"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphBioseqLen",
            "SeqGraph (40) and Bioseq (60) length mismatch"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19967 
19968 
// Three 20-position graphs added at offsets 20, 0, 40 (in that order) are
// expected to produce the GraphOutOfOrder warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphOutOfOrder)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_annot> annot(new CSeq_annot());
    auto& graphs = annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 20, 20));
    graphs.push_back(BuildGoodByteGraph(entry, 0, 20));
    graphs.push_back(BuildGoodByteGraph(entry, 40, 20));
    entry->SetSeq().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Warning,
            "GraphOutOfOrder",
            "Graph components are out of order - may be a software bug"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
19988 
19989 
// On a delta sequence, a first graph of 11 positions (offset 0) followed by
// one of 12 positions (offset 22) is expected to produce the GraphBioseqLen,
// GraphSeqLitLen and GraphStopPhase errors.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphSeqLitLen)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    CRef<CSeq_annot> annot(new CSeq_annot());
    auto& graphs = annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 0, 11));
    graphs.push_back(BuildGoodByteGraph(entry, 22, 12));
    entry->SetSeq().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphBioseqLen",
            "SeqGraph (23) and Bioseq (24) length mismatch"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphSeqLitLen",
            "SeqGraph (11) and SeqLit (12) length mismatch"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphStopPhase",
            "SeqGraph (10) and SeqLit (11) stop do not coincide"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20012 
20013 
// Makes the first delta component a location on AY123456 (0..11), then adds
// a 13-position graph at offset 0 and a 12-position graph at offset 22.
// Expects the GraphGapScore, GraphBioseqLen and GraphSeqLocLen messages.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_GRAPH_GraphSeqLocLen, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Point the first delta component at interval 0..11 of AY123456.
    auto& first_interval =
        entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt();
    first_interval.SetId().SetGenbank().SetAccession("AY123456");
    first_interval.SetFrom(0);
    first_interval.SetTo(11);

    CRef<CSeq_annot> annot(new CSeq_annot());
    auto& graphs = annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 0, 13));
    graphs.push_back(BuildGoodByteGraph(entry, 22, 12));
    entry->SetSeq().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphGapScore",
            "1 gap bases have positive score value"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Error,
            "GraphBioseqLen",
            "SeqGraph (25) and Bioseq (24) length mismatch"));
    expected_errors.push_back(
        new CExpectedError(
            "lcl|good",
            eDiag_Warning,
            "GraphSeqLocLen",
            "SeqGraph (13) and SeqLoc (12) length mismatch"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20040 
20041 
// GraphStartPhase: the second graph is built starting at offset 21 with
// length 13, while the validator reports the matching literal as starting at
// 22 with length 12.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphStartPhase)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    auto& graphs = graph_annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 0, 12));
    graphs.push_back(BuildGoodByteGraph(entry, 21, 13));
    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    // Expected diagnostics, in the order they are checked.
    static const struct {
        const char* code;
        const char* message;
    } kExpected[] = {
        { "GraphGapScore",   "1 gap bases have positive score value" },
        { "GraphBioseqLen",  "SeqGraph (25) and Bioseq (24) length mismatch" },
        { "GraphSeqLitLen",  "SeqGraph (13) and SeqLit (12) length mismatch" },
        { "GraphStartPhase", "SeqGraph (21) and SeqLit (22) start do not coincide" }
    };
    for (const auto& item : kExpected) {
        expected_errors.push_back(
            new CExpectedError("lcl|good", eDiag_Error, item.code, item.message));
    }
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20066 
20067 // note - GraphStopPhase exercised in Test_SEQ_GRAPH_GraphSeqLitLen
20068 
20069 
// GraphDiffNumber: three graphs are attached to a delta sequence for which
// the validator reports only two SeqLit components.
BOOST_FIXTURE_TEST_CASE(Test_SEQ_GRAPH_GraphDiffNumber, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Two 6-base graphs (offsets 0 and 6) followed by one 12-base graph at
    // offset 22.
    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    auto& graphs = graph_annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 0, 6));
    graphs.push_back(BuildGoodByteGraph(entry, 6, 6));
    graphs.push_back(BuildGoodByteGraph(entry, 22, 12));
    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    // Expected diagnostics, in the order they are checked.  The GraphSeqLitLen
    // message legitimately appears twice.
    static const struct {
        const char* code;
        const char* message;
    } kExpected[] = {
        { "GraphSeqLitLen",  "SeqGraph (6) and SeqLit (12) length mismatch" },
        { "GraphStopPhase",  "SeqGraph (5) and SeqLit (11) stop do not coincide" },
        { "GraphSeqLitLen",  "SeqGraph (6) and SeqLit (12) length mismatch" },
        { "GraphStartPhase", "SeqGraph (6) and SeqLit (22) start do not coincide" },
        { "GraphStopPhase",  "SeqGraph (11) and SeqLit (33) stop do not coincide" },
        { "GraphDiffNumber", "Different number of SeqGraph (3) and SeqLit (2) components" }
    };
    for (const auto& item : kExpected) {
        expected_errors.push_back(
            new CExpectedError("lcl|good", eDiag_Error, item.code, item.message));
    }
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20100 
20101 
// GraphACGTScore: a single base of the second graph is given a zero score,
// which the validator should flag with a warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphACGTScore)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    graph_annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));

    // Overwrite the last score of the second graph with zero and lower the
    // declared minimum to match.
    CRef<CSeq_graph> zero_tail_graph = BuildGoodByteGraph(entry, 22, 12);
    CSeq_graph_Base::C_Graph::TByte& scores = zero_tail_graph->SetGraph().SetByte();
    scores.SetValues().back() = 0;
    scores.SetMin(0);
    graph_annot->SetData().SetGraph().push_back(zero_tail_graph);

    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "GraphACGTScore",
                           "1 ACGT bases have zero score value - first one at position 34"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20125 
20126 
// GraphNScore: the last delta literal is rewritten to contain one N, so the
// graph covering it assigns a positive score to an N base.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphNScore)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Sixth base of the final literal becomes N.
    CSeq_literal& last_literal =
        entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral();
    last_literal.SetSeq_data().SetIupacna().Set("CCCATNATGATG");

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    auto& graphs = graph_annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 0, 12));
    graphs.push_back(BuildGoodByteGraph(entry, 22, 12));

    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Warning, "GraphNScore",
                           "1 N bases have positive score value - first one at position 28"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20148 
20149 
// GraphGapScore: an extra 10-base graph is inserted at offset 12, between the
// graphs at offsets 0 and 22, and is expected to be reported as scoring gap
// bases.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphGapScore)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    auto& graphs = graph_annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 0, 12));
    graphs.push_back(BuildGoodByteGraph(entry, 12, 10));
    graphs.push_back(BuildGoodByteGraph(entry, 22, 12));

    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    // Expected diagnostics, in the order they are checked.
    static const struct {
        const char* code;
        const char* message;
    } kExpected[] = {
        { "GraphGapScore",   "10 gap bases have positive score value" },
        { "GraphSeqLitLen",  "SeqGraph (10) and SeqLit (12) length mismatch" },
        { "GraphStartPhase", "SeqGraph (12) and SeqLit (22) start do not coincide" },
        { "GraphStopPhase",  "SeqGraph (21) and SeqLit (33) stop do not coincide" },
        { "GraphDiffNumber", "Different number of SeqGraph (3) and SeqLit (2) components" }
    };
    for (const auto& item : kExpected) {
        expected_errors.push_back(
            new CExpectedError("lcl|good", eDiag_Error, item.code, item.message));
    }
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20179 
20180 
// GraphOverlap: the first graph (offset 0, length 31) and the second graph
// (offset 30, length 30) both cover position 30.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphOverlap)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    auto& graphs = graph_annot->SetData().SetGraph();
    graphs.push_back(BuildGoodByteGraph(entry, 0, 31));
    graphs.push_back(BuildGoodByteGraph(entry, 30, 30));

    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    // The overlap is reported first, followed by the total-length mismatch
    // (31 + 30 = 61 graph values against a 60-base Bioseq).
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "GraphOverlap",
                           "Graph components overlap, with multiple scores for a single base"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen",
                           "SeqGraph (61) and Bioseq (60) length mismatch"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20203 
20204 
// GraphBioseqId: the graph location is redirected to the local id "good2",
// for which the validator is expected to report that no Bioseq was found.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphBioseqId)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Build a default graph, then point its location at a different local id.
    CRef<CSeq_graph> stray_graph = BuildGoodByteGraph(entry);
    stray_graph->SetLoc().SetInt().SetId().SetLocal().SetStr("good2");

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    graph_annot->SetData().SetGraph().push_back(stray_graph);
    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    // The first message is attributed to the unknown id, the second to the
    // sequence that carries the annotation.
    expected_errors.push_back(
        new CExpectedError("lcl|good2", eDiag_Warning, "GraphBioseqId",
                           "Bioseq not found for Graph location good2"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Critical, "GraphPackagingProblem",
                           "There is 1 mispackaged graph in this record."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20227 
20228 
// GraphACGTScoreMany: every score of the second graph is replaced with zero,
// so the validator should emit the "many" variant of the ACGT-score warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphACGTScoreMany)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    CRef<CSeq_annot> annot(new CSeq_annot());
    annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));

    CRef<CSeq_graph> graph = BuildGoodByteGraph(entry, 22, 12);
    CSeq_graph_Base::C_Graph::TByte& scores = graph->SetGraph().SetByte();
    scores.ResetValues();
    // Read the value count once, in its own type, so the loop index is not
    // compared against it as a mismatched size_t on every iteration.
    const CSeq_graph::TNumval num_scores = graph->GetNumval();
    for (CSeq_graph::TNumval i = 0; i < num_scores; ++i) {
        scores.SetValues().push_back(0);
    }
    // Declared minimum has to match the new all-zero values.
    scores.SetMin(0);
    annot->SetData().SetGraph().push_back(graph);

    entry->SetSeq().SetAnnot().push_back(annot);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphACGTScoreMany",
                              "12 ACGT bases (50.00%) have zero score value - first one at position 23"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20254 
20255 
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphNScoreMany)20256 BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphNScoreMany)
20257 {
20258     CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
20259     entry->SetSeq().SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetSeq_data().SetIupacna().Set("ANNNNNNTGATG");
20260 
20261     CRef<CSeq_annot> annot(new CSeq_annot());
20262     annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 0, 12));
20263     annot->SetData().SetGraph().push_back(BuildGoodByteGraph(entry, 22, 12));
20264 
20265     entry->SetSeq().SetAnnot().push_back(annot);
20266 
20267     STANDARD_SETUP
20268 
20269     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphNScoreMany",
20270                               "6 N bases (25.00%) have positive score value - first one at position 24"));
20271     /*
20272     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent5Prime",
20273         "Sequence has more than 5 Ns in the first 10 bases or more than 15 Ns in the first 50 bases"));
20274     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "HighNpercent3Prime",
20275         "Sequence has more than 5 Ns in the last 10 bases or more than 15 Ns in the last 50 bases"));
20276     */
20277     //AddChromosomeNoLocation(expected_errors, entry);
20278 
20279     eval = validator.Validate(seh, options);
20280     CheckErrors (*eval, expected_errors);
20281 
20282     CLEAR_ERRORS
20283 
20284 #if 0
20285 
20286     scope.RemoveTopLevelSeqEntry(seh);
20287     CSeq_literal& first_part = entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLiteral();
20288     first_part.SetSeq_data().SetIupacna().Set("AAAAAAAAAAAAAAAAAAAANNNNNNNNNNNNNNNNNNNNTTTTTTTTTTTTTTTTTTTT");
20289     first_part.SetLength(60);
20290     entry->SetSeq().SetInst().SetLength(82);
20291     entry->SetSeq().ResetAnnot();
20292     CRef<CSeq_graph> bad_graph = BuildGoodByteGraph(entry, 0, 79);
20293     CSeq_graph_Base::C_Graph::TByte& bytes = bad_graph->SetGraph().SetByte();
20294     bytes.ResetValues();
20295     for (size_t pos = 0; pos < 20; pos++) {
20296         bytes.SetValues().push_back(0);
20297     }
20298     for (size_t pos = 20; pos < 40; pos++) {
20299         bytes.SetValues().push_back(114);
20300     }
20301     for (size_t pos = 40; pos < 70; pos++) {
20302         bytes.SetValues().push_back(21);
20303     }
20304     bytes.SetMax(-1);
20305     bytes.SetMin(0);
20306     bytes.SetAxis(5);
20307     CRef<CSeq_annot> annot2(new CSeq_annot());
20308     annot2->SetData().SetGraph().push_back(bad_graph);
20309     entry->SetSeq().SetAnnot().push_back(annot2);
20310 
20311     seh = scope.AddTopLevelSeqEntry(*entry);
20312 
20313     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphBioseqLen", "SeqGraph(79) and Bioseq(72) length mismatch"));
20314     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphMax", "Graph max(-1) out of range"));
20315     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphByteLen", "SeqGraph(79) and ByteStore(70) length mismatch"));
20316     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphACGTScoreMany", "23 ACGT bases(29.11%) have zero score value - first one at position 1"));
20317     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphNScoreMany", "20 N bases(25.32%) have positive score value - first one at position 21"));
20318     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "GraphGapScore", "10 gap bases have positive score value"));
20319     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "GraphAbove", "79 quality scores have values above the reported maximum or 100"));
20320     eval = validator.Validate(seh, options);
20321     CheckErrors(*eval, expected_errors);
20322 
20323     CLEAR_ERRORS
20324 #endif
20325 }
20326 
20327 
// GraphLocInvalid (case 1): the graph interval's end is moved to position 61,
// which the validator is expected to reject as an invalid location.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphLocInvalid_1)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_graph> overlong_graph = BuildGoodByteGraph(entry);
    overlong_graph->SetLoc().SetInt().SetTo(61);

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    graph_annot->SetData().SetGraph().push_back(overlong_graph);
    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    // The message shows the location one-based, hence "1-62".
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "GraphLocInvalid",
                           "SeqGraph location (lcl|good:1-62) is invalid"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
20347 
20348 
// GraphLocInvalid (case 2): the graph's location is reset entirely, so the
// validator reports it as "Unknown" and also flags a mispackaged graph.
BOOST_AUTO_TEST_CASE(Test_SEQ_GRAPH_GraphLocInvalid_2)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_graph> unlocated_graph = BuildGoodByteGraph(entry);
    unlocated_graph->ResetLoc();

    CRef<CSeq_annot> graph_annot(new CSeq_annot());
    graph_annot->SetData().SetGraph().push_back(unlocated_graph);
    entry->SetSeq().SetAnnot().push_back(graph_annot);

    STANDARD_SETUP

    // The first expectation carries an empty accession string.
    expected_errors.push_back(
        new CExpectedError("", eDiag_Error, "GraphLocInvalid",
                           "SeqGraph location (Unknown) is invalid"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Critical, "GraphPackagingProblem",
                           "There is 1 mispackaged graph in this record."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20371 
20372 
// AnnotIDs: a Seq-annot whose data choice is "ids" must be reported.
BOOST_AUTO_TEST_CASE(Test_SEQ_ANNOT_AnnotIDs)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Select the "ids" variant without adding any content to it.
    CRef<CSeq_annot> ids_annot(new CSeq_annot());
    ids_annot->SetData().SetIds();
    entry->SetSeq().SetAnnot().push_back(ids_annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "AnnotIDs",
                           "Record contains Seq-annot.data.ids"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20390 
20391 
// AnnotLOCs: a Seq-annot whose data choice is "locs" must be reported.
BOOST_AUTO_TEST_CASE(Test_SEQ_ANNOT_AnnotLOCs)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Select the "locs" variant without adding any content to it.
    CRef<CSeq_annot> locs_annot(new CSeq_annot());
    locs_annot->SetData().SetLocs();
    entry->SetSeq().SetAnnot().push_back(locs_annot);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "AnnotLOCs",
                           "Record contains Seq-annot.data.locs"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20409 
20410 
// WrongQualOnCDS: a gene_synonym GenBank qualifier attached to a coding
// region should produce a warning.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_WrongQualOnCDS)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // Attach the offending qualifier to the set's CDS feature.
    CRef<CGb_qual> synonym_qual(new CGb_qual("gene_synonym", "anything"));
    CRef<CSeq_feat> coding_region = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    coding_region->SetQual().push_back(synonym_qual);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Warning, "WrongQualOnCDS",
                           "gene_synonym should not be a gbqual on a CDS feature"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20428 
20429 
// Checks that CSubSource::IsCorrectLatLonFormat reports a dotted
// degrees.minutes.seconds style string as not being in the correct format.
BOOST_AUTO_TEST_CASE(Test_FixLatLonFormat)
{
    // Out-parameters of IsCorrectLatLonFormat.  They are given defined
    // starting values so nothing indeterminate is ever read.  format_correct
    // starts as true so the check below only passes if the call actually
    // clears it.
    bool   format_correct    = true;
    bool   precision_correct = false;
    bool   lat_in_range      = false;
    bool   lon_in_range      = false;
    double lat_value         = 0.0;
    double lon_value         = 0.0;

    CSubSource::IsCorrectLatLonFormat ("53.43.20 N 7.43.20 E", format_correct, precision_correct,
                                     lat_in_range, lon_in_range,
                                     lat_value, lon_value);
    BOOST_CHECK(!format_correct);

}
20449 
20450 
// Checks CSubSource::ValidateLatLonCountry for three country / lat-lon pairs:
// each call must report eLatLonCountryErr_Value, return the expected message,
// and leave the expected corrected value in the lat-lon string.
BOOST_AUTO_TEST_CASE(Test_FixLatLonCountry)
{
    static const struct {
        const char* country;
        const char* latlon_in;
        const char* message;
        const char* latlon_out;
    } kScenarios[] = {
        { "USA",        "35 N 80 E",
          "Longitude should be set to W (western hemisphere)",
          "35.00 N 80.00 W" },
        { "Madagascar", "25 N 47 E",
          "Latitude should be set to S (southern hemisphere)",
          "25.00 S 47.00 E" },
        { "Austria",    "15 N 47 E",
          "Latitude and longitude values appear to be exchanged",
          "47.00 N 15.00 E" }
    };

    string latlon;
    string country;
    string error;
    CSubSource::ELatLonCountryErr errcode;

    for (const auto& scenario : kScenarios) {
        latlon = scenario.latlon_in;
        country = scenario.country;
        error = CSubSource::ValidateLatLonCountry(country, latlon, false, errcode);
        BOOST_CHECK_EQUAL(errcode, CSubSource::eLatLonCountryErr_Value);
        BOOST_CHECK_EQUAL(error, scenario.message);
        BOOST_CHECK_EQUAL(latlon, scenario.latlon_out);
    }

}
20480 
20481 
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ShortExon)20482 BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ShortExon)
20483 {
20484     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet ();
20485     CRef<CSeq_entry> nseq = entry->SetSet().SetSeq_set().front();
20486     CRef<CSeq_entry> pseq = entry->SetSet().SetSeq_set().back();
20487     CRef<CSeq_feat>  cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
20488     CRef<CSeq_feat>  prot = pseq->SetSeq().SetAnnot().front()->SetData().SetFtable().front();
20489 
20490     string start = "ATG";
20491     string stop = "TAA";
20492     string splice_left = "GT";
20493     string splice_right = "AG";
20494     string fifteen = "CCCAGAAAAACAGGT";
20495 
20496     string first_exon = start + fifteen;
20497     string intron = splice_left + fifteen + splice_right;
20498     string second_exon = fifteen;
20499     string third_exon = fifteen + stop;
20500 
20501     string nuc_str = first_exon + intron + second_exon + intron + third_exon;
20502     nseq->SetSeq().SetInst().SetSeq_data().SetIupacna().Set(nuc_str);
20503     nseq->SetSeq().SetInst().SetLength(nuc_str.length());
20504 
20505     CRef<CSeq_loc> loc1(new CSeq_loc());
20506     loc1->SetInt().SetId().SetLocal().SetStr("nuc");
20507     loc1->SetInt().SetFrom(0);
20508     TSeqPos offset = first_exon.length();
20509     loc1->SetInt().SetTo(offset - 1);
20510 
20511     offset += intron.length();
20512     CRef<CSeq_loc> loc2(new CSeq_loc());
20513     loc2->SetInt().SetId().SetLocal().SetStr("nuc");
20514     loc2->SetInt().SetFrom(offset);
20515     offset += second_exon.length();
20516     loc2->SetInt().SetTo(offset - 1);
20517 
20518 
20519     offset += intron.length();
20520     CRef<CSeq_loc> loc3(new CSeq_loc());
20521     loc3->SetInt().SetId().SetLocal().SetStr("nuc");
20522     loc3->SetInt().SetFrom(offset);
20523     offset += third_exon.length();
20524     loc3->SetInt().SetTo(offset - 1);
20525 
20526     cds->SetLocation().SetMix().Set().push_back(loc1);
20527     cds->SetLocation().SetMix().Set().push_back(loc2);
20528     cds->SetLocation().SetMix().Set().push_back(loc3);
20529 
20530     string loc_str = first_exon + second_exon + third_exon;
20531     string prot_str = "";
20532     CSeqTranslator::Translate(loc_str, prot_str);
20533     if (NStr::EndsWith(prot_str, "*")) {
20534         prot_str = prot_str.substr(0, prot_str.length() - 1);
20535     }
20536     pseq->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set(prot_str);
20537     pseq->SetSeq().SetInst().SetLength(prot_str.length());
20538 
20539     prot->SetLocation().SetInt().SetTo(prot_str.length() - 1);
20540 
20541     STANDARD_SETUP
20542 
20543     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "ShortExon",
20544                               "Internal coding region exon is too short at position 38-52"));
20545     //AddChromosomeNoLocation(expected_errors, entry);
20546     eval = validator.Validate(seh, options);
20547     CheckErrors (*eval, expected_errors);
20548 
20549     CLEAR_ERRORS
20550 }
20551 
20552 
// ExtraProteinFeature: a second protein feature with a different name is
// added to the protein sequence, and the validator is expected to complain.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_ExtraProteinFeature)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet ();
    CRef<CSeq_entry> prot_entry = entry->SetSet().SetSeq_set().back();

    // Add another protein feature, rename it and start it at position 1.
    CRef<CSeq_feat> extra_prot = AddProtFeat(prot_entry);
    extra_prot->SetData().SetProt().SetName().front() = "different name";
    extra_prot->SetLocation().SetInt().SetFrom(1);

    STANDARD_SETUP

    // The same error is expected twice (presumably once per protein
    // feature - confirm against the validator).
    for (int occurrence = 0; occurrence < 2; ++occurrence) {
        expected_errors.push_back(
            new CExpectedError("lcl|prot", eDiag_Error, "ExtraProteinFeature",
                               "Protein sequence has multiple unprocessed protein features"));
    }
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
20573 
20574 
BOOST_AUTO_TEST_CASE(Test_FixFormatDate)20575 BOOST_AUTO_TEST_CASE(Test_FixFormatDate)
20576 {
20577     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("999"), "");
20578     BOOST_CHECK_EQUAL(CSubSource::GetCollectionDateProblem("999"), "Collection_date format is not in DD-Mmm-YYYY format");
20579 
20580     //ISO dates are fine as they are
20581     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12:23:30Z"), "2014-08-10T12:23:30Z");
20582     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12:23Z"), "2014-08-10T12:23Z");
20583     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12Z"), "2014-08-10T12Z");
20584     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2014-08-10T12+00:00"), "2014-08-10T12+00:00");
20585 
20586     bool bad_format = false;
20587     bool in_future = false;
20588     CSubSource::IsCorrectDateFormat("collection date: Nov-2010 and Dec-2012", bad_format, in_future);
20589     BOOST_CHECK_EQUAL(true, bad_format);
20590     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("collection date: Nov-2010 and Dec-2012"), "");
20591 
20592     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20-12-2014"), "20-Dec-2014");
20593     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-12-2014"), "12-Dec-2014");
20594     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-11"), "Sep-2011");
20595     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("missing"), "");
20596     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("n/a"), "");
20597     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Apr-93"), "10-Apr-1993");
20598     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1-Apr"), "Apr-2001");
20599     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("#Date"), "");
20600     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("05122011"), "");
20601     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("08-Mar"), "Mar-2008");
20602     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("08022011"), "");
20603     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1-May"), "May-2001");
20604     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Apr"), "Apr-2010");
20605     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Dec"), "Dec-2010");
20606     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-May"), "May-2010");
20607     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10-Nov"), "Nov-2010");
20608     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10022011"), "");
20609     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("10082010"), "");
20610     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1082009"), "");
20611     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("11-Sep"), "Sep-2011");
20612     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Apr"), "Apr-2012");
20613     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Aug"), "Aug-2012");
20614     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Dec"), "Dec-2012");
20615     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Feb"), "Feb-2012");
20616     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Jun"), "Jun-2012");
20617     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Nov"), "Nov-2012");
20618     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("12-Oct"), "Oct-2012");
20619     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("13072010"), "");
20620     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("14-Apr-97"), "14-Apr-1997");
20621     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("14092010"), "");
20622     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("14122011"), "");
20623     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("15/05/98"), "15-May-1998");
20624     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("15072010"), "");
20625     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("15082010"), "");
20626     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("17-Mar-96"), "17-Mar-1996");
20627     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("17062011"), "");
20628     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("19-Jul-99"), "19-Jul-1999");
20629     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("19-Sep-97"), "19-Sep-1997");
20630     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("19012012"), "");
20631     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2-Aug"), "Aug-2002");
20632     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2-Jan-98"), "02-Jan-1998");
20633     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20-Jun-91"), "20-Jun-1991");
20634     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20009-04-14"), "");
20635     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20072010"), "");
20636     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20082010"), "");
20637     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("20090415"), "");
20638     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("202008-01-26"), "");
20639     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("202008-01-27"), "");
20640     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("202008-08-25"), "");
20641     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("21-Mar-96"), "21-Mar-1996");
20642     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2209"), "");
20643     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("23-Oct-94"), "23-Oct-1994");
20644     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("25-Apr-20010"), "");
20645     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("25-Jun-99"), "25-Jun-1999");
20646     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("25012012"), "");
20647     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("26-Apr-20010"), "");
20648     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("26-Feb-51"), "26-Feb-1951");
20649     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("27072010"), "");
20650     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Apr-20010"), "");
20651     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-May-98"), "29-May-1998");
20652     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Sep-94"), "29-Sep-1994");
20653     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3-Jan"), "Jan-2003");
20654     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3-Mar-93"), "03-Mar-1993");
20655     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3082010"), "");
20656     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("31082010"), "");
20657     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39259"), "");
20658     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39517"), "");
20659     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39681"), "");
20660     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39762"), "");
20661     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39846"), "");
20662     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39855"), "");
20663     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39873"), "");
20664     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39898"), "");
20665     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39903"), "");
20666     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39910"), "");
20667     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39917"), "");
20668     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39926"), "");
20669     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39980"), "");
20670     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("39982"), "");
20671     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("4-Feb"), "Feb-2004");
20672     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40010"), "");
20673     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40035"), "");
20674     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40057"), "");
20675     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40070"), "");
20676     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40087"), "");
20677     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40093"), "");
20678     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40313"), "");
20679     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40359"), "");
20680     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40360"), "");
20681     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40361"), "");
20682     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40367"), "");
20683     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40368"), "");
20684     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40370"), "");
20685     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40379"), "");
20686     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40428"), "");
20687     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("40995"), "");
20688     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Oct-20006"), "");
20689     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Sep"), "Sep-2006");
20690     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("7-Dec"), "Dec-2007");
20691     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("8-Jul"), "Jul-2008");
20692     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("8-Sep-99"), "08-Sep-1999");
20693     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Jul"), "Jul-2009");
20694     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Jul-84"), "09-Jul-1984");
20695     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Jun"), "Jun-2009");
20696     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("9-Sep"), "Sep-2009");
20697     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Apr-01"), "Apr-2001");
20698     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Apr-10"), "Apr-2010");
20699     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Aug-05"), "Aug-2005");
20700     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Aug-08"), "Aug-2008");
20701     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Aug-12"), "Aug-2012");
20702     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("August 13"), "Aug-2013");
20703     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("August 27"), "Aug-1927");
20704     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-05"), "Dec-2005");
20705     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-12"), "Dec-2012");
20706     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Dec-98"), "Dec-1998");
20707     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Feb-12"), "Feb-2012");
20708     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Feb-13"), "Feb-2013");
20709     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jan-06"), "Jan-2006");
20710     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jan-13"), "Jan-2013");
20711     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jan-96"), "Jan-1996");
20712     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jul-04"), "Jul-2004");
20713     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jul-08"), "Jul-2008");
20714     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("July 23"), "Jul-1923");
20715     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("July 9"), "Jul-2009");
20716     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-05"), "Jun-2005");
20717     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-08"), "Jun-2008");
20718     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-10"), "Jun-2010");
20719     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-12"), "Jun-2012");
20720     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-67"), "Jun-1967");
20721     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jun-80"), "Jun-1980");
20722     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("June 11"), "Jun-2011");
20723     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("June 25"), "Jun-1925");
20724     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-02"), "Mar-2002");
20725     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-05"), "Mar-2005");
20726     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-09"), "Mar-2009");
20727     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-10"), "Mar-2010");
20728     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-11"), "Mar-2011");
20729     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Mar-12"), "Mar-2012");
20730     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May 21"), "May-2021");
20731     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May 7"), "May-2007");
20732     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-05"), "May-2005");
20733     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-08"), "May-2008");
20734     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-09"), "May-2009");
20735     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-10"), "May-2010");
20736     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("May-11"), "May-2011");
20737     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Nov-10"), "Nov-2010");
20738     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Nov-11"), "Nov-2011");
20739     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Oct-05"), "Oct-2005");
20740     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Oct-10"), "Oct-2010");
20741     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("October 8"), "Oct-2008");
20742     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-05"), "Sep-2005");
20743     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-08"), "Sep-2008");
20744     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-09"), "Sep-2009");
20745     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-12"), "Sep-2012");
20746     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Sep-93"), "Sep-1993");
20747     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("September 10"), "Sep-2010");
20748     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("September 24"), "Sep-1924");
20749     // fix leading/trailing spaces
20750     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat(" 2010-03-01"), "2010-03-01");
20751 
20752     // ISO Format dates are not ambiguous
20753     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2010-03-01"), "2010-03-01");
20754 
20755     // if one token is NOT zero-padded and less than 10, and the other is either
20756     // 10 or more or IS zero-padded, then the token that is not padded and less
20757     // than 10 is the day, and the other is the year, to which we should add 2000
20758     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Apr-04"), "06-Apr-2004");
20759     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Aug-09"), "06-Aug-2009");
20760     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Feb-08"), "06-Feb-2008");
20761     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Jan-11"), "06-Jan-2011");
20762     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Jun-11"), "06-Jun-2011");
20763     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Jun-12"), "06-Jun-2012");
20764     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-May-03"), "06-May-2003");
20765     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Nov-08"), "06-Nov-2008");
20766     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6-Oct-09"), "06-Oct-2009");
20767 
20768 
20769     // check for days not in month
20770     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("31-Jun-2013"), "");
20771     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-2013"), "");
20772     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-2012"), "29-Feb-2012");
20773     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-2000"), "29-Feb-2000");
20774     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("29-Feb-1900"), "");
20775     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("01/01/1900"), "01-Jan-1900");
20776     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("04/04/2013"), "04-Apr-2013");
20777     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("11/11/2003"), "11-Nov-2003");
20778 
20779     // look for "named numbers"
20780     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("6th July 2010"), "06-Jul-2010");
20781     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("February 24th, 2012"), "24-Feb-2012");
20782     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("1st December 2012"), "01-Dec-2012");
20783     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("2nd December 2012"), "02-Dec-2012");
20784     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("3rd December 2012"), "03-Dec-2012");
20785 
20786     // unusual delimiters
20787     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("July-15_2011"), "15-Jul-2011");
20788     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("03-Aug=2011"), "03-Aug-2011");
20789     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("Jul=2010"), "Jul-2010");
20790     BOOST_CHECK_EQUAL(CSubSource::FixDateFormat("30.12.1998"), "30-Dec-1998");
20791 
20792 }
20793 
20794 
// Exercises CSubSource::DetectDateFormat on purely numeric dates: two inputs
// that must be reported as ambiguous, and three unambiguous inputs for which
// the day_first flag is verified as well.
BOOST_AUTO_TEST_CASE(Test_DetectDateFormat)
{
    // Both flags are filled in by DetectDateFormat. They start with defined
    // values so that the checks below never read an indeterminate bool, even
    // if the callee were to leave one of them untouched.
    bool ambiguous = false;
    bool day_first = false;

    // day_first is not checked for the ambiguous inputs
    CSubSource::DetectDateFormat("1-1-2010", ambiguous, day_first);
    BOOST_CHECK_EQUAL(ambiguous, true);

    CSubSource::DetectDateFormat("1-6-2010", ambiguous, day_first);
    BOOST_CHECK_EQUAL(ambiguous, true);

    // second field is 15, first field is 7
    CSubSource::DetectDateFormat("7-15-2010", ambiguous, day_first);
    BOOST_CHECK_EQUAL(ambiguous, false);
    BOOST_CHECK_EQUAL(day_first, false);

    // four-digit year leads, space-delimited
    CSubSource::DetectDateFormat("2010 8 24", ambiguous, day_first);
    BOOST_CHECK_EQUAL(ambiguous, false);
    BOOST_CHECK_EQUAL(day_first, false);

    // first field is 31, second field is 5
    CSubSource::DetectDateFormat("31-5-2008", ambiguous, day_first);
    BOOST_CHECK_EQUAL(ambiguous, false);
    BOOST_CHECK_EQUAL(day_first, true);

}
20819 
20820 
s_USAStateTest(string before,string after,CCountries::EStateCleanup expected)20821 static void s_USAStateTest(string before, string after, CCountries::EStateCleanup expected)
20822 {
20823     CCountries::EStateCleanup type = CCountries::e_NoResult;
20824     string result = CCountries::USAStateCleanup(before, type);
20825     BOOST_CHECK_EQUAL(result, after);
20826     BOOST_CHECK_EQUAL((int) type, (int) expected);
20827 }
20828 
20829 
BOOST_AUTO_TEST_CASE(Test_USAStateCleanup)20830 BOOST_AUTO_TEST_CASE(Test_USAStateCleanup)
20831 {
20832     s_USAStateTest("Puerto Rico: San Juan", "USA: Puerto Rico, San Juan", CCountries::e_Corrected );
20833     s_USAStateTest("USA: Puerto Rico", "USA: Puerto Rico", CCountries::e_Valid );
20834     s_USAStateTest("USA: Puerto Rico, Florida", "USA: Puerto Rico, Florida", CCountries::e_Ambiguous );
20835     s_USAStateTest("USA: Florida, Puerto Rico", "USA: Florida, Puerto Rico", CCountries::e_Ambiguous );
20836 
20837     s_USAStateTest("USA: Bethesda, State Of maryland", "USA: Maryland, Bethesda", CCountries::e_Corrected );
20838     s_USAStateTest("USA:NY", "USA: New York", CCountries::e_Corrected );
20839     s_USAStateTest("USA: Delaware, county South carolina", "USA: Delaware, county South carolina", CCountries::e_Valid );
20840     s_USAStateTest("USA:LA, EastBatonRougeParish", "USA: Louisiana, East Baton Rouge Parish", CCountries::e_Corrected );
20841     s_USAStateTest("USA: DeSoto Parish, Louisiana", "USA: Louisiana, DeSoto Parish", CCountries::e_Corrected );
20842     s_USAStateTest("USA: Napa, Solano, Yolo, Marin Counties, CA", "USA: California, Napa, Solano, Yolo, Marin Counties", CCountries::e_Corrected );
20843     s_USAStateTest("USA: Montana, Maine", "USA: Montana, Maine", CCountries::e_Ambiguous );
20844     s_USAStateTest("USA: San  Diego  County, CA", "USA: California, San Diego County", CCountries::e_Corrected );
20845     s_USAStateTest("USA: Madison", "USA: Madison", CCountries::e_Missing );
20846     s_USAStateTest("USA", "USA", CCountries::e_Valid );
20847 
20848     s_USAStateTest("USA: Arkansas, Washington", "USA: Arkansas, Washington", CCountries::e_Ambiguous );
20849     s_USAStateTest("USA: Washington, Arkansas", "USA: Washington, Arkansas", CCountries::e_Ambiguous );
20850     s_USAStateTest("USA: AR, Washington", "USA: Arkansas, Washington", CCountries::e_Ambiguous );
20851     s_USAStateTest("USA: Washington, AR", "USA: Washington, Arkansas", CCountries::e_Ambiguous );
20852     s_USAStateTest("USA: Wisconsin, Oregon", "USA: Wisconsin, Oregon", CCountries::e_Ambiguous );
20853 
20854     s_USAStateTest("Puerto Rico: San Juan", "USA: Puerto Rico, San Juan", CCountries::e_Corrected );
20855     s_USAStateTest("USA: Puerto Rico", "USA: Puerto Rico", CCountries::e_Valid );
20856     s_USAStateTest("USA: Puerto Rico, Florida", "USA: Puerto Rico, Florida", CCountries::e_Ambiguous );
20857     s_USAStateTest("USA: Florida, Puerto Rico", "USA: Florida, Puerto Rico", CCountries::e_Ambiguous );
20858 
20859     s_USAStateTest("USA:Los Angeles", "USA: Los Angeles", CCountries::e_Missing );
20860     s_USAStateTest("USA:Hayward", "USA: Hayward", CCountries::e_Missing );
20861 
20862     CCountries::TUsaExceptionMap exm;
20863     exm["USA: Washington, Arkansas"] = "USA: Arkansas, Washington";
20864     // self-entry is needed for converting e_Ambiguous to e_Valid (from full name) or e_Corrected (from abbreviation)
20865     exm["USA: Arkansas, Washington"] = "USA: Arkansas, Washington";
20866     exm["USA: Puerto Rico, Florida"] = "USA: Puerto Rico, Florida";
20867     exm["USA: Florida, Puerto Rico"] = "USA: Puerto Rico, Florida";
20868     exm["USA: Los Angeles"] = "USA: California, Los Angeles";
20869     exm["USA:Hayward"] = "USA: California, Hayward";
20870     // exm["USA:PR"] = "USA: Puerto Rico";
20871     // exm["USA:GU"] = "USA: Guam";
20872     // exm["USA:VI"] = "USA: US Virgin Islands";
20873     // exm["USA:AS"] = "USA: American Samoa";
20874     CCountries::LoadUSAExceptionMap (exm);
20875 
20876     s_USAStateTest("USA: Arkansas, Washington", "USA: Arkansas, Washington", CCountries::e_Valid );
20877     s_USAStateTest("USA: Washington, Arkansas", "USA: Arkansas, Washington", CCountries::e_Corrected );
20878     s_USAStateTest("USA: AR, Washington", "USA: Arkansas, Washington", CCountries::e_Corrected );
20879     s_USAStateTest("USA: Washington, AR", "USA: Arkansas, Washington", CCountries::e_Corrected );
20880     s_USAStateTest("USA: Wisconsin, Oregon", "USA: Wisconsin, Oregon", CCountries::e_Ambiguous );
20881 
20882     s_USAStateTest("Puerto Rico: San Juan", "USA: Puerto Rico, San Juan", CCountries::e_Corrected );
20883     s_USAStateTest("USA: Puerto Rico", "USA: Puerto Rico", CCountries::e_Valid );
20884     s_USAStateTest("USA: Puerto Rico, Florida", "USA: Puerto Rico, Florida", CCountries::e_Valid );
20885     s_USAStateTest("USA: Florida, Puerto Rico", "USA: Puerto Rico, Florida", CCountries::e_Corrected );
20886 
20887     s_USAStateTest("USA:Los Angeles", "USA: California, Los Angeles", CCountries::e_Corrected );
20888     s_USAStateTest("USA:Hayward", "USA: California, Hayward", CCountries::e_Corrected );
20889     s_USAStateTest("USA: Springfield", "USA: Springfield", CCountries::e_Missing );
20890 
20891     s_USAStateTest("USA:GU", "USA: Guam", CCountries::e_Corrected );
20892     s_USAStateTest("Belize", "Belize", CCountries::e_NotUSA );
20893     s_USAStateTest("France: Paris", "France: Paris", CCountries::e_NotUSA );
20894 }
20895 
20896 
BOOST_AUTO_TEST_CASE(Test_NewFixCountry)20897 BOOST_AUTO_TEST_CASE(Test_NewFixCountry)
20898 {
20899     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Russia, Tatarstan, Kazan"), "Russia: Tatarstan, Kazan");
20900     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Egypt: Red Sea, Ras Mohamed, Sinai"), "Egypt: Red Sea, Ras Mohamed, Sinai");
20901     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Kenya."), "Kenya");
20902     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("U.S.A."), "USA");
20903     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("\"United Kingdom: Scotland, Edinburgh\""), "United Kingdom: Scotland, Edinburgh");
20904     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("1896"), "");
20905     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Anderson, Mesa Verde, Colorado"), "");
20906     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Ansirabe"), "");
20907     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Antarctic Territory Claimed by Australia"), "");
20908     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Ari Ksatr"), "");
20909     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Australia: south-western australia"), "Australia: south-western australia");
20910     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Auwahi, Maui"), "");
20911     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Between Liberia and Ivory Coast"), "");
20912     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Caroline Island, Leticia"), "");
20913     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Catalina Island, California"), "");
20914     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Chia-i"), "");
20915     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Congo"), "");
20916     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Cousin Island"), "");
20917     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Czechoslovakia"), "Czechoslovakia");
20918     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("DE"), "");
20919     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("France: North East France Nievre-Morvan Breuil Chenue forest"), "France: North East France Nievre-Morvan Breuil Chenue forest");
20920     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Great Britain"), "United Kingdom: Great Britain");
20921     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Greenland: Saqqaq Culture site Qeqertasussuk, north-western Greenland"), "Greenland: Saqqaq Culture site Qeqertasussuk, north-western Greenland");
20922     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Guadaloupe Island"), "");
20923     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hawaii"), "USA: Hawaii");
20924     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hamoa Bay, Maui, Hawaii, USA"), "USA: Hamoa Bay, Maui, Hawaii");
20925     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hortus Leiden, the Netherlands"), "Netherlands: Hortus Leiden");
20926     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Hortus, Leiden, the Netherlands"), "Netherlands: Hortus, Leiden");
20927     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Joffreville"), "");
20928     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Korea"), "Korea");
20929     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Kuala Belalong, Ulu Temburong National Park"), "");
20930     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Lake Fryxell"), "");
20931     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Luxemburg"), "Luxembourg");
20932     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Mediterranean Sea, Spain"), "");
20933     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Mexico. Loreto Bay, Gulf of California."), "Mexico: Loreto Bay, Gulf of California");
20934     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Meyendel, the Netherlands"), "Netherlands: Meyendel");
20935     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Mount St. Helena, California"), "");
20936     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Nanyuki"), "");
20937     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Netherland"), "Netherlands");
20938     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("New Guinea"), "Papua New Guinea");
20939     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("North Sea, Netherlands"), "");
20940     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Noumea"), "");
20941     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Roosendaal, De Moeren, the Netherlands"), "Netherlands: Roosendaal, De Moeren");
20942     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("SPAIN (orig)"), "Spain: (orig)");
20943     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("San Tome and Principe Island (1998)"), "Sao Tome and Principe: (1998)");
20944     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Scotland"), "United Kingdom: Scotland");
20945     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA (orig)"), "USA: (orig)");
20946     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA: Boqueron National Wildlife Refuge, Puerto Rico"), "USA: Boqueron National Wildlife Refuge, Puerto Rico");
20947     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA: hypersaline sediment collected at Bitter Lake, New Mexico"), "USA: hypersaline sediment collected at Bitter Lake, New Mexico");
20948     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Wales"), "United Kingdom: Wales");
20949     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("West Germany"), "Germany: West Germany");
20950     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("West Lobe Bonney"), "");
20951     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Wissenkerke, Keihoogteweg, the Netherlands"), "Netherlands: Wissenkerke, Keihoogteweg");
20952     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Wolfskill Orchand, Winters, California"), "");
20953     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Yun Shui"), "");
20954     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USSR: Kazakhstan, Kurtu"), "USSR: Kazakhstan, Kurtu");
20955     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA:"), "USA");
20956     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("south sudan"), "South Sudan");
20957     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("UK: Whiteford Burrows, Gower, Wales"), "United Kingdom: Whiteford Burrows, Gower, Wales");
20958     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Whiteford Burrows, Gower, Wales"), "United Kingdom: Wales, Whiteford Burrows, Gower");
20959     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Burma"), "Myanmar");
20960     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Siam"), "Thailand");
20961     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("AA:BB:CC"), "");
20962     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("AA:BB:Southern China"), "");
20963     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("UK: Whiteford Burrows: Gower: Wales"), "United Kingdom: Whiteford Burrows, Gower, Wales");
20964     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("USA:DE:Dover"), "USA:DE,Dover");
20965     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Burma:A:B"), "Burma:A,B");
20966     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Puerto Rico"), "Puerto Rico");
20967     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Puerto Rico", true), "USA: Puerto Rico");
20968     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Puerto Rico, San Juan", true), "USA: Puerto Rico, San Juan");
20969     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Guam", true), "USA: Guam");
20970     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("American Samoa", true), "USA: American Samoa");
20971     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Virgin Islands", true), "USA: US Virgin Islands");
20972     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("puerto rico"), "Puerto Rico");
20973     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("puerto rico", true), "USA: Puerto Rico");
20974     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("guam"), "Guam");
20975     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("guam", true), "USA: Guam");
20976     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Georgia"), "USA: Georgia");
20977     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Georgia", true), "USA: Georgia");
20978     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Guam"), "USA: Guam");
20979     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("United States: Guam", true), "USA: Guam");
20980     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia"), "Georgia");
20981     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia", true), "Georgia");
20982     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia: Tbilisi"), "Georgia: Tbilisi");
20983     BOOST_CHECK_EQUAL(CCountries::NewFixCountry("Georgia: Tbilisi", true), "Georgia: Tbilisi");
20984 }
20985 
20986 
// Checks COrgMod::FixStructuredVoucher on a series of voucher strings: values
// that are expected to stay untouched and values that are expected to be
// rewritten into "code:id" form. Each case works on its own string.
BOOST_AUTO_TEST_CASE(Fix_Structured_Voucher)
{
    // Value that already carries a <USA> qualifier: expected to come back
    // unchanged. The return value is not examined for this case.
    // (An earlier note marked this case as removed while issues with updating
    // the list were sorted out; the check is active.)
    {
        string voucher = "USNM<USA>:12345";
        COrgMod::FixStructuredVoucher(voucher, "s");
        BOOST_CHECK_EQUAL(voucher, "USNM<USA>:12345");
    }

    // can't fix, needs country code; return value not examined either
    {
        string voucher = "ABS<CHN>:12345";
        COrgMod::FixStructuredVoucher(voucher, "s");
        BOOST_CHECK_EQUAL(voucher, "ABS<CHN>:12345");
    }

    // add structure when there is a space instead of a colon
    // (an earlier note marked the structure-fixing cases as removed while
    // open questions were considered; the checks are active)
    {
        string voucher = "AMNH 12345";
        const bool changed = COrgMod::FixStructuredVoucher(voucher, "s");
        BOOST_CHECK_EQUAL(changed, true);
        BOOST_CHECK_EQUAL(voucher, "AMNH:12345");
    }

    // add structure when letters are directly followed by numbers
    {
        string voucher = "ABB666";
        const bool changed = COrgMod::FixStructuredVoucher(voucher, "c");
        BOOST_CHECK_EQUAL(changed, true);
        BOOST_CHECK_EQUAL(voucher, "ABB:666");
    }

    // can also fix biomaterial
    {
        string voucher = "CNWGRGL123";
        const bool changed = COrgMod::FixStructuredVoucher(voucher, "b");
        BOOST_CHECK_EQUAL(changed, true);
        BOOST_CHECK_EQUAL(voucher, "CNWGRGL:123");
    }

    // will not fix when the code is too short
    {
        string voucher = "A12345";
        const bool changed = COrgMod::FixStructuredVoucher(voucher, "s");
        BOOST_CHECK_EQUAL(changed, false);
        BOOST_CHECK_EQUAL(voucher, "A12345");
    }

    // if institution code in parentheses at end of unstructured value, reorder
    // GB-6454
    {
        string voucher = "M.Riewe 182 (CAS)";
        const bool changed = COrgMod::FixStructuredVoucher(voucher, "s");
        BOOST_CHECK_EQUAL(changed, true);
        BOOST_CHECK_EQUAL(voucher, "CAS:M.Riewe 182");
    }

    // don't fix if value in parentheses is not an institution code
    {
        string voucher = "L.R. Xu 0081 (WUG)";
        const bool changed = COrgMod::FixStructuredVoucher(voucher, "s");
        BOOST_CHECK_EQUAL(changed, false);
        BOOST_CHECK_EQUAL(voucher, "L.R. Xu 0081 (WUG)");
    }

}
21034 
21035 
BOOST_AUTO_TEST_CASE(Test_CheckEnds)21036 BOOST_AUTO_TEST_CASE(Test_CheckEnds)
21037 {
21038     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
21039     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNNN");
21040     entry->SetSeq().SetInst().SetLength(62);
21041     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
21042     CScope scope(*objmgr);
21043     scope.AddDefaults();
21044     CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
21045 
21046     EBioseqEndIsType begin_n = eBioseqEndIsType_None;
21047     EBioseqEndIsType begin_gap = eBioseqEndIsType_None;
21048     EBioseqEndIsType end_n = eBioseqEndIsType_None;
21049     EBioseqEndIsType end_gap = eBioseqEndIsType_None;
21050     bool begin_ambig = false, end_ambig = false;
21051 
21052     CBioseq_Handle bsh = seh.GetSeq();
21053     CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21054     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_All);
21055     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_All);
21056     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_All);
21057     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_All);
21058     BOOST_CHECK_EQUAL(begin_ambig, true);
21059     BOOST_CHECK_EQUAL(end_ambig, true);
21060 
21061     scope.RemoveTopLevelSeqEntry(seh);
21062     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("NNNNNNNNNAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAANNNNNNNNN");
21063     entry->SetSeq().SetInst().SetLength(60);
21064     seh = scope.AddTopLevelSeqEntry(*entry);
21065 
21066     bsh = seh.GetSeq();
21067     CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21068     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_Last);
21069     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_Last);
21070     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_Last);
21071     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_Last);
21072     BOOST_CHECK_EQUAL(begin_ambig, true);
21073     BOOST_CHECK_EQUAL(end_ambig, true);
21074 
21075     scope.RemoveTopLevelSeqEntry(seh);
21076     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCCAA");
21077     entry->SetSeq().SetInst().SetLength(42);
21078     seh = scope.AddTopLevelSeqEntry(*entry);
21079 
21080     bsh = seh.GetSeq();
21081     CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21082     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21083     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21084     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21085     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21086     BOOST_CHECK_EQUAL(begin_ambig, false);
21087     BOOST_CHECK_EQUAL(end_ambig, false);
21088 
21089     scope.RemoveTopLevelSeqEntry(seh);
21090     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ANANTNNNCAAAATTGGCCAAAATTGGCCAAAANTNNCNCNA");
21091     entry->SetSeq().SetInst().SetLength(42);
21092     seh = scope.AddTopLevelSeqEntry(*entry);
21093 
21094     bsh = seh.GetSeq();
21095     CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21096     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21097     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21098     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21099     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21100     BOOST_CHECK_EQUAL(begin_ambig, true);
21101     BOOST_CHECK_EQUAL(end_ambig, true);
21102 
21103     scope.RemoveTopLevelSeqEntry(seh);
21104     entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("GTGTGANANTNNNCNNNNNTGGCCAAAATTGGCCAAAANTNNCNCNAGTGTG");
21105     entry->SetSeq().SetInst().SetLength(52);
21106     seh = scope.AddTopLevelSeqEntry(*entry);
21107 
21108     bsh = seh.GetSeq();
21109     CheckBioseqEndsForNAndGap(bsh, begin_n, begin_gap, end_n, end_gap, begin_ambig, end_ambig);
21110     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21111     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21112     BOOST_CHECK_EQUAL(begin_n, eBioseqEndIsType_None);
21113     BOOST_CHECK_EQUAL(end_n, eBioseqEndIsType_None);
21114     BOOST_CHECK_EQUAL(begin_ambig, true);
21115     BOOST_CHECK_EQUAL(end_ambig, true);
21116 
21117 }
21118 
21119 
// Validates a good sequence whose organism carries a nat_host OrgMod of
// "Jatropha cf." and compares the validator output against expected_errors.
// No expected error is registered in this test body, so the entry is
// presumably meant to validate cleanly (name suggests ticket SQD-313).
BOOST_AUTO_TEST_CASE(Test_SQD_313)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Jatropha cf.");

    // STANDARD_SETUP is a project macro; it presumably provides the seh,
    // validator, options, eval and expected_errors names used below
    STANDARD_SETUP

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // project macro; presumably releases the expected_errors entries
    CLEAR_ERRORS
}
21133 
21134 
// Builds a nuc-prot set whose set-level create_date (Jun 12, 1998) is later
// than its update_date (Jun 11, 1998) and expects an InconsistentDates warning
// on both the nucleotide (gb|U54469.1|) and the protein (lcl|prot).
BOOST_AUTO_TEST_CASE(Test_SQD_292)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    auto& set_descriptors = entry->SetSet().SetDescr().Set();

    // create_date descriptor: 12 June 1998
    CRef<CSeqdesc> created_desc(new CSeqdesc());
    auto& created_std = created_desc->SetCreate_date().SetStd();
    created_std.SetMonth(6);
    created_std.SetDay(12);
    created_std.SetYear(1998);
    set_descriptors.push_back(created_desc);

    // update_date descriptor: 11 June 1998, i.e. one day before creation
    CRef<CSeqdesc> updated_desc(new CSeqdesc());
    auto& updated_std = updated_desc->SetUpdate_date().SetStd();
    updated_std.SetMonth(6);
    updated_std.SetDay(11);
    updated_std.SetYear(1998);
    set_descriptors.push_back(updated_desc);

    // Prepend a GI and then an accession.version id to the nucleotide, so the
    // accession ends up first in the id list
    CRef<CSeq_entry> nuc = entry->SetSet().SetSeq_set().front();
    auto& nuc_ids = nuc->SetSeq().SetId();
    CRef<CSeq_id> gi_seqid(new CSeq_id());
    gi_seqid->SetGi(GI_CONST(1322283));
    nuc_ids.push_front(gi_seqid);
    CRef<CSeq_id> accver_seqid(new CSeq_id("gb|U54469.1"));
    nuc_ids.push_front(accver_seqid);

    STANDARD_SETUP

    // the same message is expected once per sequence
    const string date_conflict_msg =
        "Inconsistent create_date [Jun 12, 1998] and update_date [Jun 11, 1998]";
    expected_errors.push_back(new CExpectedError("gb|U54469.1|", eDiag_Warning, "InconsistentDates",
                              date_conflict_msg));
    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Warning, "InconsistentDates",
                              date_conflict_msg));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    CLEAR_ERRORS
}
21169 
21170 
// SQD-1470: two user-object descriptors built from the same genome assembly
// comment are attached to one sequence; the validator is expected to report
// MultipleStrucComms for the ##Genome-Assembly-Data-START## prefix.
BOOST_AUTO_TEST_CASE(Test_SQD_1470)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Describe the assembly once ...
    edit::CGenomeAssemblyComment assembly_comment;
    assembly_comment.SetAssemblyMethodProgram("a");
    assembly_comment.SetAssemblyMethodVersion("1");
    assembly_comment.SetGenomeCoverage("3x");
    assembly_comment.SetSequencingTechnology("foo");

    // ... and attach it twice, each time as a freshly made user object.
    for (int copy = 0; copy < 2; ++copy) {
        CRef<CSeqdesc> desc(new CSeqdesc());
        desc->SetUser(*(assembly_comment.MakeUserObject()));
        entry->SetSeq().SetDescr().Set().push_back(desc);
    }

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "MultipleStrucComms",
        "Multiple structured comments with prefix ##Genome-Assembly-Data-START##"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21200 
21201 
// SQD-1309: a reverse-complemented nuc-prot set whose nucleotide is marked as
// TSA transcribed RNA is expected to produce CDSonMinusStrandTranscribedRNA,
// both from full validation and from GetTSACDSOnMinusStrandErrors.
BOOST_AUTO_TEST_CASE(Test_SQD_1309)
{
    const string err_code = "CDSonMinusStrandTranscribedRNA";
    const string err_text =
        "Coding region on TSA transcribed RNA should not be on the minus strand";

    // Build the record: reverse-complement, then mark the nucleotide as TSA RNA.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    unit_test_util::RevComp(entry);
    CRef<CSeq_entry> nentry = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    SetTech(nentry, CMolInfo::eTech_tsa);
    unit_test_util::SetBiomol(nentry, CMolInfo::eBiomol_transcribed_RNA);
    nentry->SetSeq().SetInst().SetMol(CSeq_inst::eMol_rna);

    STANDARD_SETUP

    // Full validation run.
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, err_code, err_text));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // The dedicated entry point must report the same single warning.
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, err_code, err_text));
    eval = validator.GetTSACDSOnMinusStrandErrors(seh);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21230 
21231 
// BadCDScomment: a CDS comment of "ambiguity in stop codon" is expected to be
// flagged when the record is otherwise unchanged. After a terminal code break
// is added, and again after the nucleotide sequence is replaced, validation is
// checked against an expected-error list to which nothing further is added.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_BadCDScomment)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> nentry = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);

    STANDARD_SETUP

    // The comment is set after the entry has been added to the scope.
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetComment("ambiguity in stop codon");

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Error, "BadCDScomment",
        "Feature comment indicates ambiguity in stop codon but no ambiguities are present in stop codon."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Add a terminal code break, then re-register the entry with the scope.
    edit::AddTerminalCodeBreak(*cds, seh.GetScope());
    scope.RemoveTopLevelSeqEntry(seh);
    seh = scope.AddTopLevelSeqEntry(*entry);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Swap in a nucleotide sequence that contains an N, and re-register again.
    scope.RemoveTopLevelSeqEntry(seh);
    nentry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set(
        "ATGCCCAGAAAAACAGAGATAAACTNAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
    seh = scope.AddTopLevelSeqEntry(*entry);

// Error below is not expected anymore since VR-110 issue fixed:
//
//    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "UnnecessaryTranslExcept",
//                              "Unexpected transl_except * at position 9 just past end of protein"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21272 
21273 
// InvalidFuzz: three misc features are given the same fuzz limit (tl or tr)
// on both ends of an interval -- two single-interval features and one
// packed-int feature with two intervals. Six InvalidFuzz errors are expected.
BOOST_AUTO_TEST_CASE(Test_SEQ_FEAT_InvalidFuzz)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Feature 1: "tl" on both ends of its interval.
    CRef<CSeq_feat> misc1 = unit_test_util::AddMiscFeature(entry);
    auto& left_int = misc1->SetLocation().SetInt();
    left_int.SetFuzz_from().SetLim(CInt_fuzz::eLim_tl);
    left_int.SetFuzz_to().SetLim(CInt_fuzz::eLim_tl);

    // Feature 2: starts at 5, "tr" on both ends.
    CRef<CSeq_feat> misc2 = unit_test_util::AddMiscFeature(entry);
    auto& right_int = misc2->SetLocation().SetInt();
    right_int.SetFrom(5);
    right_int.SetFuzz_from().SetLim(CInt_fuzz::eLim_tr);
    right_int.SetFuzz_to().SetLim(CInt_fuzz::eLim_tr);

    // Feature 3: packed-int location made of a "tl/tl" and a "tr/tr" interval.
    CRef<CSeq_feat> misc3 = unit_test_util::AddMiscFeature(entry);
    CRef<CSeq_id> id(new CSeq_id());
    id->Assign(*(entry->GetSeq().GetId().front()));

    CRef<CSeq_interval> int1(new CSeq_interval(*id, 0, 5));
    int1->SetFuzz_from().SetLim(CInt_fuzz::eLim_tl);
    int1->SetFuzz_to().SetLim(CInt_fuzz::eLim_tl);

    CRef<CSeq_interval> int2(new CSeq_interval(*id, 10, 15));
    int2->SetFuzz_from().SetLim(CInt_fuzz::eLim_tr);
    int2->SetFuzz_to().SetLim(CInt_fuzz::eLim_tr);

    auto& packed = misc3->SetLocation().SetPacked_int().Set();
    packed.push_back(int1);
    packed.push_back(int2);

    STANDARD_SETUP

    // Expected messages, in the order they are registered.
    static const char* const kFuzzMessages[] = {
        "Should not specify 'space to left' for both ends of interval",
        "Should not specify 'space to right' for both ends of interval",
        "Should not specify 'space to left' for both ends of interval",
        "Should not specify 'space to right' for both ends of interval",
        "Should not specify 'space to left' at first position of non-circular sequence",
        "Should not specify 'space to left' at first position of non-circular sequence"
    };
    for (const char* fuzz_msg : kFuzzMessages) {
        expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
                                                     "InvalidFuzz", fuzz_msg));
    }
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21321 
21322 
// SQD-1532: an unstructured culture_collection value must be rejected with
// the "should be structured" message.
BOOST_AUTO_TEST_CASE(Test_SQD_1532)
{
    const auto problem = COrgMod::IsCultureCollectionValid("50% TSB + 2mM Cr(VI)");
    BOOST_CHECK_EQUAL(problem, "Culture_collection should be structured, but is not");
}
21327 
21328 
// Exercises CSubSource's sex-qualifier helpers: which values are accepted by
// IsValidSexQualifierValue and what FixSexQualifierValue rewrites them to.
BOOST_AUTO_TEST_CASE(Test_SexQualifiers)
{
    // Input value -> expected validity.
    struct SValidityCase {
        const char* value;
        bool        valid;
    };
    static const SValidityCase kValidityCases[] = {
        { "M",                        true  },
        { "Male",                     true  },
        { "male",                     true  },
        { "llama",                    false },
        { "m/f",                      true  },
        { "pooled males and females", true  },
        { "pooled male and female",   true  },
        { "mixed",                    true  }
    };
    for (const SValidityCase& vcase : kValidityCases) {
        BOOST_CHECK_EQUAL(CSubSource::IsValidSexQualifierValue(vcase.value), vcase.valid);
    }

    // Input value -> expected normalized form.
    struct SFixCase {
        const char* value;
        const char* fixed;
    };
    static const SFixCase kFixCases[] = {
        { "M",                        "male"                     },
        { "m/f",                      "male and female"          },
        { "m/f/neuter",               "male, female, and neuter" },
        { "male and female (pooled)", "pooled male and female"   }
    };
    for (const SFixCase& fcase : kFixCases) {
        BOOST_CHECK_EQUAL(CSubSource::FixSexQualifierValue(fcase.value), string(fcase.fixed));
    }
}
21346 
21347 
// Turning strain forwarding off on a CBioSource must set the orgname
// attribute to "nomodforward" and be reported by the getter; turning it back
// on must be reported as well.
BOOST_AUTO_TEST_CASE(TEST_DisableStrainForwarding)
{
    CBioSource biosrc;

    // Disable forwarding and inspect both the raw attribute and the getter.
    biosrc.SetDisableStrainForwarding(true);
    BOOST_CHECK_EQUAL(biosrc.GetOrg().GetOrgname().GetAttrib(), "nomodforward");
    BOOST_CHECK_EQUAL(biosrc.GetDisableStrainForwarding(), true);

    // Re-enable forwarding.
    biosrc.SetDisableStrainForwarding(false);
    BOOST_CHECK_EQUAL(biosrc.GetDisableStrainForwarding(), false);
}
21358 
21359 
// A sequence consisting solely of N residues is expected to be reported with
// the critical AllNs error.
BOOST_AUTO_TEST_CASE(Test_AllNs)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Replace the sequence data and set the declared length to 30.
    auto& inst = entry->SetSeq().SetInst();
    inst.SetSeq_data().SetIupacna().Set("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN");
    inst.SetLength(30);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Critical,
                                                 "AllNs", "Sequence is all Ns"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21377 
21378 
// CSubSource::AutoFix: for several subtypes, sets a raw value on one shared
// CSubSource, runs AutoFix, and checks the resulting name.
BOOST_AUTO_TEST_CASE(Test_SubSourceAutofix)
{
    CRef<CSubSource> ss(new CSubSource());

    // Applies subtype + raw value to the shared subsource, autofixes it, and
    // compares the resulting name with the expected normalized value.
    auto check_autofix = [&ss](CSubSource::TSubtype subtype,
                               const string& raw_value,
                               const string& fixed_value) {
        ss->SetSubtype(subtype);
        ss->SetName(raw_value);
        ss->AutoFix();
        BOOST_CHECK_EQUAL(ss->GetName(), fixed_value);
    };

    check_autofix(CSubSource::eSubtype_country, "Maryland, USA", "USA: Maryland");
    check_autofix(CSubSource::eSubtype_collection_date, "1-14-97", "14-Jan-1997");
    check_autofix(CSubSource::eSubtype_lat_lon,
                  "Lattitude: 25.790544; longitude: -80.214930",
                  "25.790544 N 80.214930 W");
    check_autofix(CSubSource::eSubtype_sex, "m/f/neuter", "male, female, and neuter");
    check_autofix(CSubSource::eSubtype_altitude, "123 ft.", "37 m");
}
21409 
21410 
// COrgMod::AutoFix: checks the rewritten subname for two strain values and
// one nat_host value, all applied to the same COrgMod.
BOOST_AUTO_TEST_CASE(Test_OrgModAutofix)
{
    CRef<COrgMod> om(new COrgMod());

    // Sets the subname, runs AutoFix, and checks the rewritten value. The
    // subtype is left as whatever the caller set beforehand.
    auto check_autofix = [&om](const string& raw_value, const string& fixed_value) {
        om->SetSubname(raw_value);
        om->AutoFix();
        BOOST_CHECK_EQUAL(om->GetSubname(), fixed_value);
    };

    om->SetSubtype(COrgMod::eSubtype_strain);
    check_autofix("ATCC1234", "ATCC 1234");
    check_autofix("DSM  567", "DSM 567");

    om->SetSubtype(COrgMod::eSubtype_nat_host);
    check_autofix("human", "Homo sapiens");
}
21427 
21428 
// RemoveCultureNotes: first exercised directly on a CSubSource note, then via
// the CBioSource that owns that same subsource.
BOOST_AUTO_TEST_CASE(Test_RmCultureNotes)
{
    CRef<CSubSource> ss(new CSubSource());
    ss->SetSubtype(CSubSource::eSubtype_other);

    // Store a note on the subsource and strip culture notes from it.
    auto strip_note = [&ss](const string& note) {
        ss->SetName(note);
        ss->RemoveCultureNotes();
    };

    strip_note("a; [mixed bacterial source]; b");
    BOOST_CHECK_EQUAL(ss->GetName(), "a; b");

    strip_note("[uncultured (using species-specific primers) bacterial source]");
    BOOST_CHECK_EQUAL(ss->GetName(), "amplified with species-specific primers");

    // Here the name is expected to end up unset.
    strip_note("[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]");
    BOOST_CHECK_EQUAL(ss->IsSetName(), false);

    strip_note("[BankIt_uncultured16S_wizard]; [species_specific primers]; [dgge]");
    BOOST_CHECK_EQUAL(ss->GetName(), "amplified with species-specific primers");

    // Same operation, driven through the owning biosource.
    CRef<CBioSource> src(new CBioSource());
    ss->SetName("a; [mixed bacterial source]; b");
    src->SetSubtype().push_back(ss);
    src->RemoveCultureNotes();
    BOOST_CHECK_EQUAL(ss->GetName(), "a; b");

    // Here the biosource is expected to end up with no subtype list set.
    ss->SetName("[BankIt_uncultured16S_wizard]; [universal primers]; [tgge]");
    src->RemoveCultureNotes();
    BOOST_CHECK_EQUAL(src->IsSetSubtype(), false);
}
21455 
21456 
BOOST_AUTO_TEST_CASE(Test_VR_28)21457 BOOST_AUTO_TEST_CASE(Test_VR_28)
21458 {
21459     // prepare entry
21460     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
21461 
21462     STANDARD_SETUP
21463 
21464     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
21465             "Non-viral source feature should not have a segment qualifier"));
21466     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
21467             "Non-viral source feature should not have a segment qualifier"));
21468 
21469     expected_errors.push_back(new CExpectedError("lcl|good",
21470                                                  eDiag_Warning,
21471                                                  "MultipleSourceQualifiers",
21472                                                  "Multiple segment qualifiers present"));
21473     //AddChromosomeNoLocation(expected_errors, entry);
21474 
21475     // Mutliple segment qualifiers
21476     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_segment, "1");
21477     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_segment, "2");
21478 
21479     eval = validator.Validate(seh, options);
21480     CheckErrors (*eval, expected_errors);
21481     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_segment, "");
21482 
21483     CLEAR_ERRORS
21484 
21485     expected_errors.push_back(new CExpectedError("lcl|good",
21486                                                  eDiag_Warning,
21487                                                  "MultipleSourceQualifiers",
21488                                                  "Multiple segment qualifiers present"));
21489 
21490     // Multiple collected_by qualifiers
21491     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collected_by, "Michael Hunter");
21492     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collected_by, "Steven Fisher");
21493     expected_errors[0]->SetErrMsg("Multiple collected_by qualifiers present");
21494 
21495     eval = validator.Validate(seh, options);
21496     CheckErrors (*eval, expected_errors);
21497     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collected_by, "");
21498 
21499     // Multiple identified_by qualifiers
21500     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_identified_by, "Michael Hunter");
21501     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_identified_by, "Steven Fisher");
21502     expected_errors[0]->SetErrMsg("Multiple identified_by qualifiers present");
21503 
21504     eval = validator.Validate(seh, options);
21505     CheckErrors (*eval, expected_errors);
21506     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_identified_by, "");
21507 
21508     // Multiple collection_date qualifiers
21509     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collection_date, "31-May-2014");
21510     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collection_date, "30-Apr-2014");
21511     expected_errors[0]->SetErrMsg("Multiple collection_date qualifiers present");
21512 
21513     eval = validator.Validate(seh, options);
21514     CheckErrors (*eval, expected_errors);
21515     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collection_date, "");
21516 
21517     CLEAR_ERRORS
21518 }
21519 
21520 
// GP-9919: a protein containing '*' residues and a matching nucleotide are
// expected to give StopInProtein, ExceptionProblem and InternalStop. A
// ValidationSuppression user object is then used to suppress these one at a
// time, and the expected-error list is shrunk accordingly.
BOOST_AUTO_TEST_CASE(Test_GP_9919)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    STANDARD_SETUP

    // Replace the protein (last member) and the nucleotide (first member).
    auto& members = entry->SetSet().SetSeq_set();
    members.back()->SetSeq().SetInst().SetSeq_data().SetIupacaa().Set("MP*K*E*N");
    members.front()->SetSeq().SetInst().SetSeq_data().SetIupacna().Set(
        "GTGCCCTAAAAATAAGAGTAAAACTAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");

    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
    cds->SetExcept(true);
    cds->SetExcept_text("unclassified translation discrepancy");

    // Direct helper checks before running the validator.
    BOOST_CHECK_EQUAL(validator::HasStopInProtein(*cds, scope), true);
    BOOST_CHECK_EQUAL(validator::HasInternalStop(*cds, scope, false), true);

    // Index 0: StopInProtein, 1: ExceptionProblem, 2: InternalStop.
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Error, "StopInProtein",
        "[3] termination symbols in protein sequence (gene? - fake protein name)"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Error, "ExceptionProblem",
        "unclassified translation discrepancy is not a legal exception explanation"));
    expected_errors.push_back(new CExpectedError(
        "lcl|nuc", eDiag_Warning, "InternalStop",
        "3 internal stops (and illegal start codon). Genetic code [0]"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Suppress InternalStop; its expected-error slot is freed and nulled.
    CRef<CSeqdesc> suppress(new CSeqdesc());
    suppress->SetUser().SetObjectType(CUser_object::eObjectType_ValidationSuppression);
    CValidErrorFormat::AddSuppression(suppress->SetUser(), eErr_SEQ_FEAT_InternalStop);
    entry->SetSet().SetDescr().Set().push_back(suppress);
    delete expected_errors[2];
    expected_errors[2] = nullptr;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Additionally suppress ExceptionProblem.
    CValidErrorFormat::AddSuppression(suppress->SetUser(), eErr_SEQ_FEAT_ExceptionProblem);
    delete expected_errors[1];
    expected_errors[1] = nullptr;
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Additionally suppress StopInProtein; nothing is re-registered after
    // the list is cleared.
    CValidErrorFormat::AddSuppression(suppress->SetUser(), eErr_SEQ_INST_StopInProtein);
    CLEAR_ERRORS
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21571 
// CBioSource::RemoveLineageSourceNotes:
//   1. without a taxid nothing is removed;
//   2. with a taxid, the "other" subsource note and the "other" orgmod note
//      built below are removed;
//   3. a further "other" orgmod note ("domain: unknown domain") is expected
//      to be left in place.
BOOST_AUTO_TEST_CASE(Test_RemoveLineageSourceNotes)
{
    CRef<CBioSource> bsrc(new CBioSource);
    bsrc->SetOrg().SetTaxname("Influenza A virus");
    bsrc->SetOrg().SetOrgname().SetLineage("Viruses; ssRNA negative-strand viruses; Orthomyxoviridae; Influenzavirus A");

    CRef<CSubSource> subsrc(new CSubSource(CSubSource::eSubtype_other, "Organism: viruses"));
    bsrc->SetSubtype().push_back(subsrc);
    CRef<COrgMod> mod_a(new COrgMod(COrgMod::eSubtype_strain, "virus strain"));
    CRef<COrgMod> mod_b(new COrgMod(COrgMod::eSubtype_other, "note: influenza A"));
    bsrc->SetOrg().SetOrgname().SetMod().push_back(mod_a);
    bsrc->SetOrg().SetOrgname().SetMod().push_back(mod_b);

    bool removed = bsrc->RemoveLineageSourceNotes();
    BOOST_CHECK_EQUAL(removed, false); // it won't remove the notes as there is no taxid
    bsrc->SetOrg().SetTaxId(TAX_ID_CONST(11320));

    removed = bsrc->RemoveLineageSourceNotes();
    BOOST_CHECK_EQUAL(removed, true);
    BOOST_CHECK_EQUAL(bsrc->IsSetSubtype(), false);
    // No orgmod of subtype "other" may remain after the removal.
    FOR_EACH_ORGMOD_ON_BIOSOURCE( orgmod, *bsrc) {
        if ((*orgmod)->IsSetSubtype()) {
            BOOST_CHECK_EQUAL((*orgmod)->GetSubtype() == COrgMod::eSubtype_other, false);
        }
    }

    // Attach the extra note to the biosource. Previously mod_c was created
    // but never added, so the checks below had nothing to inspect.
    CRef<COrgMod> mod_c(new COrgMod(COrgMod::eSubtype_other, "domain: unknown domain"));
    bsrc->SetOrg().SetOrgname().SetMod().push_back(mod_c);
    removed = bsrc->RemoveLineageSourceNotes();
    BOOST_CHECK_EQUAL(removed, false);
    // Every non-strain orgmod that is still present must be of subtype "other".
    FOR_EACH_ORGMOD_ON_BIOSOURCE( orgmod, *bsrc) {
        if ((*orgmod)->IsSetSubtype() && (*orgmod)->GetSubtype() != COrgMod::eSubtype_strain) {
            BOOST_CHECK_EQUAL((*orgmod)->GetSubtype() == COrgMod::eSubtype_other, true);
        }
    }
}
21607 
21608 
// GB-3714: CGb_qual::BuildExperiment / ParseExperiment round trip for every
// combination of present/absent category and DOI.
BOOST_AUTO_TEST_CASE(Test_GB_3714)
{
    // Parse outputs are deliberately shared across all cases, so a value left
    // over from an earlier parse would show up in a later check.
    string experiment;
    string category;
    string doi;

    // Builds the qualifier text from its parts, checks the text, parses it
    // back, and checks that the parts come out unchanged.
    auto check_round_trip = [&](const string& in_category,
                                const string& in_experiment,
                                const string& in_doi,
                                const string& expected_text) {
        string built = CGb_qual::BuildExperiment(in_category, in_experiment, in_doi);
        BOOST_CHECK_EQUAL(built, expected_text);

        CGb_qual::ParseExperiment(built, category, experiment, doi);
        BOOST_CHECK_EQUAL(category, in_category);
        BOOST_CHECK_EQUAL(experiment, in_experiment);
        BOOST_CHECK_EQUAL(doi, in_doi);
    };

    check_round_trip("",            "experiment",  "",     "experiment");
    check_round_trip("",            "experiment2", "DOI",  "experiment2[DOI]");
    check_round_trip("COORDINATES", "experiment3", "",     "COORDINATES:experiment3");
    check_round_trip("EXISTENCE",   "experiment4", "DOI2", "EXISTENCE:experiment4[DOI2]");
}
21645 
21646 
// SQD-2036: contaminated cell-line detection, first through
// CSubSource::CheckCellLine directly and then through the validator
// (SuspectedContaminatedCellLine warning).
BOOST_AUTO_TEST_CASE(Test_SQD_2036)
{
    // Cell line/organism pair expected to produce a message.
    const string flagged = CSubSource::CheckCellLine("222", "Homo sapiens");
    BOOST_CHECK_EQUAL(flagged, "The International Cell Line Authentication Committee database indicates that 222 from Homo sapiens is known to be contaminated by PA1 from Human. Please see http://iclac.org/databases/cross-contaminations/ for more information and references.");

    // Different cell line, same organism: empty result expected.
    const string other_line = CSubSource::CheckCellLine("223", "Homo sapiens");
    BOOST_CHECK_EQUAL(other_line, "");

    // Same cell line, different organism: empty result expected.
    const string other_org = CSubSource::CheckCellLine("222", "Canis familiaris");
    BOOST_CHECK_EQUAL(other_org, "");

    // Validator path: guinea pig record with a GPS-M cell line.
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetTaxname(entry, "Cavia porcellus");
    // SetTaxon is called with 0 and then with the new id; presumably the
    // first call clears the existing taxon reference -- confirm in the helper.
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 10141);
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_cell_line, "GPS-M");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good",
        eDiag_Warning,
        "SuspectedContaminatedCellLine",
        "The International Cell Line Authentication Committee database indicates that GPS-M from Cavia porcellus is known to be contaminated by Strain L-M from Mouse. Please see http://iclac.org/databases/cross-contaminations/ for more information and references."));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
21677 
21678 
// VR-146: a phy-set whose members have had their pub descriptors removed is
// expected to yield NoPubFound (error) and MissingPubRequirement (info). With
// eVal_genome_submission the latter is expected at error severity. After a
// pub is added back, the expected-error list is left with nothing registered.
BOOST_AUTO_TEST_CASE(Test_VR_146)
{
    // Two members, both stripped of their publication descriptors.
    CRef<CSeq_entry> e1 = unit_test_util::BuildGoodSeq();
    unit_test_util::RemoveDescriptorType(e1, CSeqdesc::e_Pub);
    CRef<CSeq_entry> e2 = unit_test_util::BuildGoodNucProtSet();
    unit_test_util::RemoveDescriptorType(e2, CSeqdesc::e_Pub);

    // Wrap them in a phy-set.
    CRef<CSeq_entry> entry(new CSeq_entry());
    auto& phy_set = entry->SetSet();
    phy_set.SetClass(CBioseq_set::eClass_phy_set);
    phy_set.SetSeq_set().push_back(e1);
    phy_set.SetSeq_set().push_back(e2);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "NoPubFound",
        "No publications anywhere on this entire record."));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Info, "MissingPubRequirement",
        "No submission citation anywhere on this entire record."));
    //AddChromosomeNoLocation(expected_errors, entry);

    // Default options.
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Genome-submission mode: the second message is expected as an error.
    options |= CValidator::eVal_genome_submission;
    expected_errors[1]->SetSeverity(eDiag_Error);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // With a publication restored, no expected errors are registered.
    unit_test_util::AddGoodPub(entry);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21718 
21719 
// VR-711: a misc_feature and a repeat_region without comments are expected to
// give MiscFeatureNeedsNote and RepeatRegionNeedsNote. The expected severity
// is warning by default, error with eVal_genome_submission, and warning again
// once the sequence also carries an EMBL accession.
BOOST_AUTO_TEST_CASE(Test_VR_711)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // A repeat_region with no comment ...
    CRef<CSeq_feat> repeat_region = unit_test_util::AddMiscFeature(entry);
    repeat_region->SetData().SetImp().SetKey("repeat_region");
    repeat_region->ResetComment();
    // ... and a plain misc feature with no comment.
    CRef<CSeq_feat> misc = unit_test_util::AddMiscFeature(entry);
    misc->ResetComment();

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "MiscFeatureNeedsNote",
        "A note or other qualifier is required for a misc_feature"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "RepeatRegionNeedsNote",
        "repeat_region has no qualifiers"));
    //AddChromosomeNoLocation(expected_errors, entry);

    // Applies one severity to both expected errors.
    auto expect_severity = [&expected_errors](EDiagSev severity) {
        expected_errors[0]->SetSeverity(severity);
        expected_errors[1]->SetSeverity(severity);
    };

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // bump to error for -U
    options |= CValidator::eVal_genome_submission;
    expect_severity(eDiag_Error);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // only warning for EMBL/DDBJ
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> other_acc(new CSeq_id());
    auto& embl_id = other_acc->SetEmbl();
    embl_id.SetAccession("HE717023");
    embl_id.SetVersion(1);
    entry->SetSeq().SetId().push_back(other_acc);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expect_severity(eDiag_Warning);
    ChangeErrorAcc(expected_errors, "emb|HE717023.1|");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21765 
21766 
// feature::IsLocationInFrame: starting from a copy of the CDS location, the
// start and/or stop are moved one base at a time and the reported frame
// status is checked after each step. Locations lying past the CDS stop, and
// a plain interval compared against a mix-location CDS, are expected to be
// reported as NotIn.
BOOST_AUTO_TEST_CASE(Test_IsLocationInFrame)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
    CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
    CSeq_feat_Handle fh = scope->GetSeq_featHandle(*cds);

    CRef<CSeq_loc> loc(new CSeq_loc());

    // Small helpers for the repeated steps below.
    auto reset_to_cds = [&]() { loc->Assign(cds->GetLocation()); };
    auto advance_from = [&]() { loc->SetInt().SetFrom(loc->GetInt().GetFrom() + 1); };
    auto retreat_to   = [&]() { loc->SetInt().SetTo(loc->GetInt().GetTo() - 1); };
    auto frame_status = [&]() { return feature::IsLocationInFrame(fh, *loc); };

    // Moving only the start: +1 and +2 are bad starts, +3 is in frame again.
    reset_to_cds();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_InFrame, frame_status());
    advance_from();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStart, frame_status());
    advance_from();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStart, frame_status());
    advance_from();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_InFrame, frame_status());

    // Moving only the stop: -1 and -2 are bad stops, -3 is in frame again.
    reset_to_cds();
    retreat_to();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStop, frame_status());
    retreat_to();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStop, frame_status());
    retreat_to();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_InFrame, frame_status());

    // Moving both ends together.
    reset_to_cds();
    advance_from();
    retreat_to();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStartAndStop, frame_status());
    advance_from();
    retreat_to();
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_BadStartAndStop, frame_status());

    // A three-base interval starting right after the CDS stop.
    loc->SetInt().SetFrom(cds->GetLocation().GetStop(eExtreme_Biological) + 1);
    loc->SetInt().SetTo(loc->GetInt().GetFrom() + 2);
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_NotIn, frame_status());

    // Replace the CDS location with a mix location and compare it against a
    // single interval spanning that location's biological start and stop.
    CRef<CSeq_id> loc_id(new CSeq_id());
    loc_id->Assign(loc->GetInt().GetId());
    cds->SetLocation().Assign(*(unit_test_util::MakeMixLoc(loc_id)));
    loc->SetInt().SetFrom(cds->GetLocation().GetStart(eExtreme_Biological));
    loc->SetInt().SetTo(cds->GetLocation().GetStop(eExtreme_Biological));
    BOOST_CHECK_EQUAL(feature::eLocationInFrame_NotIn, frame_status());
}
21813 
s_CreateReplyWithMessage(const string & message)21814 CRef<CTaxon3_reply> s_CreateReplyWithMessage(const string& message)
21815 {
21816     CRef<CTaxon3_reply> reply(new CTaxon3_reply);
21817     CRef<CT3Reply> t3reply(new CT3Reply);
21818     t3reply->SetError().SetLevel(CT3Reply::TError::eLevel_error);
21819     t3reply->SetError().SetMessage(message);
21820     reply->SetReply().push_back(t3reply);
21821     return reply;
21822 }
21823 
21824 
21825 //removed until issues with caching and mocking service can be resolved
// Feeds the validator a mock taxonomy service whose replies are all empty
// CTaxon3_reply objects and expects a single TaxonomyServiceProblem error.
BOOST_AUTO_TEST_CASE(Test_Empty_Taxon_Reply)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // queue up 50 replies, none of which carries any content
    CMockTaxon::TReplies replies;
    for (size_t remaining = 50; remaining > 0; --remaining) {
        replies.push_back(CRef<CTaxon3_reply>(new CTaxon3_reply));
    }

    STANDARD_SETUP_WITH_MOCK_TAXON(replies);

    expected_errors.push_back(new CExpectedError("lcl|good",
        eDiag_Error,
        "TaxonomyServiceProblem",
        "Taxonomy service connection failure"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21850 
21851 
// VR-601: the nucleotide of a good nuc-prot set is given the GenBank
// accession "ABCD123456789". The validator is expected to report
// InconsistentMolInfoTechnique ("WGS accession should have Mol-info.tech of
// wgs"), and to report nothing once the technique is set to eTech_targeted.
BOOST_AUTO_TEST_CASE(Test_VR_601)
{
    const string id_str = "ABCD123456789";

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_id> id(new CSeq_id());
    id->SetGenbank().SetAccession(id_str);
    unit_test_util::ChangeNucId(entry, id);

    STANDARD_SETUP

    const string expected_acc = "gb|" + id_str + "|";
    expected_errors.push_back(new CExpectedError(expected_acc, eDiag_Error, "InconsistentMolInfoTechnique", "WGS accession should have Mol-info.tech of wgs"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // error suppressed for TLS
    //AddChromosomeNoLocation(expected_errors, entry);
    CRef<CSeq_entry> first_member = entry->SetSet().SetSeq_set().front();
    unit_test_util::SetTech(first_member, CMolInfo::eTech_targeted);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21876 
21877 
// VR-612: both a protein name and a definition line (title descriptor on the
// nucleotide) that contain the text "RefSeq" are expected to be reported as
// RefSeqInText errors.
BOOST_AUTO_TEST_CASE(Test_VR_612)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();

    // protein name with the offending text
    unit_test_util::SetNucProtSetProductName(entry, "This product name contains RefSeq");

    // title on the nucleotide sequence with the offending text
    CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeqdesc> defline(new CSeqdesc());
    defline->SetTitle("This title contains RefSeq");
    nuc->SetSeq().SetDescr().Set().push_back(defline);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "RefSeqInText", "Protein name contains 'RefSeq'"));
    expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "RefSeqInText", "Definition line contains 'RefSeq'"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21897 
21898 
// VR-616: the strain values "yes", "NO", "-" and "microbial" are each
// expected to be reported with an OrgModValueInvalid warning that quotes the
// offending value.
BOOST_AUTO_TEST_CASE(Test_VR_616)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "yes");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrgModValueInvalid", "Orgmod.strain should not be 'yes'"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Remaining values: each one replaces the previous strain and only the
    // message of the single expected error changes.
    struct SStrainCase {
        const char* value;
        const char* message;
    };
    static const SStrainCase kFurtherCases[] = {
        { "NO",        "Orgmod.strain should not be 'NO'" },
        { "-",         "Orgmod.strain should not be '-'" },
        { "microbial", "Orgmod.strain should not be 'microbial'" }
    };

    for (const SStrainCase& strain_case : kFurtherCases) {
        // the empty value is set first, as in every step of this test,
        // before the new strain value is applied
        unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, "");
        unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_strain, strain_case.value);
        expected_errors[0]->SetErrMsg(strain_case.message);
        eval = validator.Validate(seh, options);
        CheckErrors(*eval, expected_errors);
    }

    CLEAR_ERRORS

}
21932 
// A gene whose mix location has one interval on each of three different
// sequences (good1, good2, good3) is expected to produce a BadLocation error
// in an eco set. No error is expected after the set is turned into a small
// genome set (title removed, all members marked as chloroplast).
BOOST_AUTO_TEST_CASE(Test_BadLocation)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();

    CRef<CSeq_feat> gene(new CSeq_feat());
    gene->SetData().SetGene().SetLocus("badguy");

    // one 0..10 interval per sequence, appended in order
    const string kTargetIds[] = { "good1", "good2", "good3" };
    for (const string& target_id : kTargetIds) {
        CRef<CSeq_loc> piece(new CSeq_loc());
        auto& interval = piece->SetInt();
        interval.SetFrom(0);
        interval.SetTo(10);
        interval.SetId().SetLocal().SetStr(target_id);
        gene->SetLocation().SetMix().Set().push_back(piece);
    }

    unit_test_util::AddFeat(gene, entry->SetSet().SetSeq_set().front());

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "BadLocation",
                           "Feature location intervals should all be on the same sequence"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // error goes away if organelle small genome set
    entry->SetSet().SetClass(CBioseq_set::eClass_small_genome_set);
    // remove title, not appropriate for small genome set
    unit_test_util::RemoveDescriptorType(entry, CSeqdesc::e_Title);
    for (auto& member : entry->SetSet().SetSeq_set()) {
        unit_test_util::SetGenome(member, CBioSource::eGenome_chloroplast);
    }
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
21981 
21982 
BOOST_AUTO_TEST_CASE(Test_VR_78)21983 BOOST_AUTO_TEST_CASE(Test_VR_78)
21984 {
21985     CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
21986     CRef<CSeq_entry> nuc = unit_test_util::GetNucleotideSequenceFromGoodNucProtSet(entry);
21987     CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);
21988     CRef<CSeq_feat> prot = unit_test_util::GetProtFeatFromGoodNucProtSet(entry);
21989     CRef<CSeq_feat> mrna = unit_test_util::MakemRNAForCDS(cds);
21990     mrna->SetData().SetRna().SetExt().SetName(prot->GetData().GetProt().GetName().front());
21991     unit_test_util::AddFeat(mrna, nuc);
21992     CRef<CSeq_feat> gene = unit_test_util::MakeGeneForFeature(cds);
21993     unit_test_util::AddFeat(gene, nuc);
21994 
21995     STANDARD_SETUP
21996     //AddChromosomeNoLocation(expected_errors, entry);
21997     eval = validator.Validate(seh, options);
21998     CheckErrors(*eval, expected_errors);
21999 
22000     scope.RemoveTopLevelSeqEntry(seh);
22001     unit_test_util::SetNucProtSetPartials(entry, true, false);
22002     seh = scope.AddTopLevelSeqEntry(*entry);
22003 
22004     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22005         "gene should not be 5' complete if coding region is 5' partial"));
22006     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22007         "mRNA should not be 5' complete if coding region is 5' partial"));
22008     eval = validator.Validate(seh, options);
22009     CheckErrors(*eval, expected_errors);
22010     CLEAR_ERRORS
22011 
22012     scope.RemoveTopLevelSeqEntry(seh);
22013     unit_test_util::SetNucProtSetPartials(entry, false, true);
22014     seh = scope.AddTopLevelSeqEntry(*entry);
22015 
22016     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22017         "gene should not be 3' complete if coding region is 3' partial"));
22018     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22019         "mRNA should not be 3' complete if coding region is 3' partial"));
22020     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
22021         "3' partial is not at end of sequence, gap, or consensus splice site"));
22022     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
22023         "Got stop codon, but 3'end is labeled partial"));
22024     //AddChromosomeNoLocation(expected_errors, entry);
22025 
22026     eval = validator.Validate(seh, options);
22027     CheckErrors(*eval, expected_errors);
22028 
22029     CLEAR_ERRORS
22030 
22031     scope.RemoveTopLevelSeqEntry(seh);
22032     unit_test_util::SetNucProtSetPartials(entry, true, true);
22033     seh = scope.AddTopLevelSeqEntry(*entry);
22034 
22035     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22036         "gene should not be 5' complete if coding region is 5' partial"));
22037     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch5Prime",
22038         "mRNA should not be 5' complete if coding region is 5' partial"));
22039     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22040         "gene should not be 3' complete if coding region is 3' partial"));
22041     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemMismatch3Prime",
22042         "mRNA should not be 3' complete if coding region is 3' partial"));
22043     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Warning, "PartialProblemNotSpliceConsensus3Prime",
22044         "3' partial is not at end of sequence, gap, or consensus splice site"));
22045     expected_errors.push_back(new CExpectedError("lcl|nuc", eDiag_Error, "PartialProblemHasStop",
22046         "Got stop codon, but 3'end is labeled partial"));
22047     //AddChromosomeNoLocation(expected_errors, entry);
22048     eval = validator.Validate(seh, options);
22049     CheckErrors(*eval, expected_errors);
22050 
22051     CLEAR_ERRORS
22052 }
22053 
22054 
// VR-166: the host value "Atlantic white-sided dolphin" is expected to be
// left unchanged by FixSpecificHost, accepted by IsSpecificHostValid, and to
// validate without errors when used as a nat_host modifier.
BOOST_AUTO_TEST_CASE(Test_VR_166)
{
    const string host = "Atlantic white-sided dolphin";
    string error_msg;

    // stand-alone helper functions
    BOOST_CHECK_EQUAL(host, FixSpecificHost(host));
    BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));

    // full validation of a sequence that uses the host value
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, host);

    STANDARD_SETUP

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
22076 
22077 
// A good eco set whose class is changed to eClass_genbank is expected to be
// reported with TitleNotAppropriateForSet.
BOOST_AUTO_TEST_CASE(TEST_TitleNotAppropriateForSet)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodEcoSet();
    entry->SetSet().SetClass(CBioseq_set::eClass_genbank);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good1", eDiag_Error, "TitleNotAppropriateForSet",
        "Only Pop/Phy/Mut/Eco sets should have titles"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
22093 
22094 
// VR-664: validation of the "Assembly Name" field of a genome assembly
// structured comment. "valid value", "Ec2009C-3227" and
// "Anop_step_SDA-500_V1" are expected to pass; "not,valid" is expected to
// produce two BadStrucCommInvalidFieldValue messages.
BOOST_AUTO_TEST_CASE(Test_VR_664)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // assemble the structured comment
    CRef<CUser_object> user = edit::CGenomeAssemblyComment::MakeEmptyUserObject();
    edit::CGenomeAssemblyComment::SetAssemblyMethod(*user, "x v. y");
    CRef<CUser_field> assembly_name(new CUser_field());
    assembly_name->SetLabel().SetStr("Assembly Name");
    assembly_name->SetData().SetStr("valid value");
    user->SetData().push_back(assembly_name);
    edit::CGenomeAssemblyComment::SetGenomeCoverage(*user, "2x");
    edit::CGenomeAssemblyComment::SetSequencingTechnology(*user, "z");

    // the descriptor receives a copy of the user object
    CRef<CSeqdesc> desc(new CSeqdesc());
    desc->SetUser().Assign(*user);
    entry->SetSeq().SetDescr().Set().push_back(desc);

    STANDARD_SETUP

    // Changes the assembly name, copies the user object into the descriptor
    // again, and re-runs the validator.
    auto validate_with_assembly_name = [&](const string& name) {
        assembly_name->SetData().SetStr(name);
        desc->SetUser().Assign(*user);
        eval = validator.Validate(seh, options);
    };

    // initial value: no errors
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // a comma makes the value invalid
    validate_with_assembly_name("not,valid");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
                                 "BadStrucCommInvalidFieldValue",
                                 "Structured Comment invalid; the field value and/or name are incorrect"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
                                 "BadStrucCommInvalidFieldValue",
                                 "not,valid is not a valid value for Assembly Name"));
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // two more values that must be accepted
    //AddChromosomeNoLocation(expected_errors, entry);
    validate_with_assembly_name("Ec2009C-3227");
    CheckErrors(*eval, expected_errors);

    validate_with_assembly_name("Anop_step_SDA-500_V1");
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
22142 
22143 
// VR-478: a gene with two intervals (0..5 and 10..15) and a mobile_element
// of type "superintegron" located between them (6..9) is expected to
// validate without errors.
BOOST_AUTO_TEST_CASE(Test_VR_478)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    const CSeq_id& seq_id = *(entry->GetSeq().GetId().front());

    // two-interval gene
    CRef<CSeq_loc> int1(new CSeq_loc());
    int1->SetInt().SetId().Assign(seq_id);
    int1->SetInt().SetFrom(0);
    int1->SetInt().SetTo(5);

    CRef<CSeq_loc> int2(new CSeq_loc());
    int2->SetInt().SetId().Assign(seq_id);
    int2->SetInt().SetFrom(10);
    int2->SetInt().SetTo(15);

    CRef<CSeq_feat> gene(new CSeq_feat());
    gene->SetData().SetGene().SetLocus("a");
    gene->SetLocation().SetMix().Set().push_back(int1);
    gene->SetLocation().SetMix().Set().push_back(int2);
    unit_test_util::AddFeat(gene, entry);

    // mobile element occupying the gap between the two gene intervals
    CRef<CSeq_feat> mobile_element(new CSeq_feat());
    mobile_element->SetData().SetImp().SetKey("mobile_element");
    auto& element_interval = mobile_element->SetLocation().SetInt();
    element_interval.SetId().Assign(*(entry->GetSeq().GetId().front()));
    element_interval.SetFrom(6);
    element_interval.SetTo(9);
    mobile_element->SetQual().push_back(
        CRef<CGb_qual>(new CGb_qual("mobile_element_type", "superintegron")));
    unit_test_util::AddFeat(mobile_element, entry);

    STANDARD_SETUP

    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
22178 
22179 
// VR-630: the feature returned by AddMiscFeature is turned into a gene and
// flagged with the "trans-splicing" exception; the validator is expected to
// warn that a trans-spliced feature should have multiple intervals.
BOOST_AUTO_TEST_CASE(Test_VR_630)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> gene = unit_test_util::AddMiscFeature(entry);
    gene->SetData().SetGene().SetLocus("X");
    gene->SetExcept(true);
    gene->SetExcept_text("trans-splicing");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "BadTranssplicedInterval",
        "Trans-spliced feature should have multiple intervals"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
22199 
22200 
// VR-660: a misc_recomb feature with recombination_class "other" is expected
// to be accepted while it has a comment, to be reported with
// RecombinationClassOtherNeedsNote once the comment is removed, and to be
// accepted again when the class is changed to "mitotic".
BOOST_AUTO_TEST_CASE(Test_VR_660)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    CRef<CSeq_feat> recomb = unit_test_util::AddMiscFeature(entry);
    recomb->SetData().SetImp().SetKey("misc_recomb");
    CRef<CGb_qual> qual(new CGb_qual("recombination_class", "other"));
    recomb->SetQual().push_back(qual);

    STANDARD_SETUP

    // first check ok because recomb has comment
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // error because 'other' and no comment
    recomb->ResetComment();
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "RecombinationClassOtherNeedsNote",
        "The recombination_class 'other' is missing the required /note"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // info because not other and not valid
    // removed per VR-770
    // qual->SetVal("not a valid recombination class");
    // expected_errors[0]->SetErrMsg("'not a valid recombination class' is not a legal value for recombination_class");
    // expected_errors[0]->SetSeverity(eDiag_Info);
    // eval = validator.Validate(seh, options);
    // CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // no error because legal
    qual->SetVal("mitotic");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
22242 
22243 
AddOrgmod(COrg_ref & org,const string & val,COrgMod::ESubtype subtype)22244 void AddOrgmod(COrg_ref& org, const string& val, COrgMod::ESubtype subtype)
22245 {
22246     CRef<COrgMod> om(new COrgMod(subtype, val));
22247     org.SetOrgname().SetMod().push_back(om);
22248 }
22249 
22250 
AddOrgmodDescriptor(CRef<CSeq_entry> entry,const string & val,COrgMod::ESubtype subtype)22251 void AddOrgmodDescriptor(CRef<CSeq_entry> entry, const string& val, COrgMod::ESubtype subtype)
22252 {
22253     CRef<CSeqdesc> src_desc(new CSeqdesc());
22254     // should look up
22255     src_desc->SetSource().SetOrg().SetTaxname("Influenza A virus");
22256     AddOrgmod(src_desc->SetSource().SetOrg(), val, subtype);
22257     entry->SetDescr().Set().push_back(src_desc);
22258 }
22259 
AddOrgmodFeat(CRef<CSeq_entry> entry,const string & val,COrgMod::ESubtype subtype)22260 void AddOrgmodFeat(CRef<CSeq_entry> entry, const string& val, COrgMod::ESubtype subtype)
22261 {
22262     CRef<CSeq_feat> src_feat = unit_test_util::AddMiscFeature(entry);
22263     src_feat->SetData().SetBiosrc().SetOrg().SetTaxname("Influenza virus A");
22264     AddOrgmod(src_feat->SetData().SetBiosrc().SetOrg(), val, subtype);
22265 }
22266 
22267 typedef vector< pair<string, string> > THostStringsVector;
22268 
22269 
TestBulkSpecificHostFixList(const THostStringsVector & test_values)22270 void TestBulkSpecificHostFixList(const THostStringsVector& test_values)
22271 {
22272     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
22273 
22274     vector<CRef<COrg_ref> > original;
22275     vector<CRef<COrg_ref> > to_adjust;
22276 
22277     ITERATE(THostStringsVector, it, test_values) {
22278         AddOrgmodDescriptor(entry, it->first, COrgMod::eSubtype_nat_host);
22279         AddOrgmodFeat(entry, it->first, COrgMod::eSubtype_nat_host);
22280         CRef<COrg_ref> org(new COrg_ref());
22281         org->SetTaxname("foo");
22282         CRef<COrgMod> om(new COrgMod(COrgMod::eSubtype_nat_host, it->first));
22283         org->SetOrgname().SetMod().push_back(om);
22284         to_adjust.push_back(org);
22285         CRef<COrg_ref> cpy(new COrg_ref());
22286         cpy->Assign(*org);
22287         original.push_back(cpy);
22288     }
22289     string error_message;
22290 
22291     CTaxValidationAndCleanup tval;
22292     tval.Init(*entry);
22293     vector<CRef<COrg_ref> > org_rq_list = tval.GetSpecificHostLookupRequest(true);
22294 
22295     objects::CTaxon3 taxon3;
22296     taxon3.Init();
22297     CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(org_rq_list);
22298     BOOST_CHECK_EQUAL(reply->GetReply().size(), org_rq_list.size());
22299 
22300     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust, error_message), true);
22301 
22302     vector<CRef<COrg_ref> >::iterator org = to_adjust.begin();
22303     vector<CRef<COrg_ref> >::iterator cpy = original.begin();
22304     while (org != to_adjust.cend()) {
22305         const string& before = (*cpy)->GetOrgname().GetMod().front()->GetSubname();
22306         const string& after = (*org)->GetOrgname().GetMod().front()->GetSubname();
22307         THostStringsVector::const_iterator tvit = test_values.cbegin();
22308         while (tvit != test_values.cend() && !NStr::Equal(tvit->first, before)) {
22309             ++tvit;
22310         }
22311 
22312         BOOST_CHECK_EQUAL(after, tvit->second);
22313         ++org;
22314         ++cpy;
22315         ++tvit;
22316     }
22317 }
22318 
// SQD-4354: two single-value runs of the bulk specific-host fix. The host
// "Zymomonas anaerobia" and the misspelled "Zymononas mobilis" are each
// expected to come back as "Zymomonas mobilis".
BOOST_AUTO_TEST_CASE(Test_SQD_4354)
{
    const THostStringsVector first_case(1,
        pair<string, string>("Zymomonas anaerobia", "Zymomonas mobilis"));
    TestBulkSpecificHostFixList(first_case);

    const THostStringsVector second_case(1,
        pair<string, string>("Zymononas mobilis", "Zymomonas mobilis"));
    TestBulkSpecificHostFixList(second_case);
}
22329 
22330 
BOOST_AUTO_TEST_CASE(Test_BulkSpecificHostFix)22331 BOOST_AUTO_TEST_CASE(Test_BulkSpecificHostFix)
22332 {
22333     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
22334 
22335     THostStringsVector test_values;
22336     test_values.push_back(pair<string, string>("Homo supiens", "Homo supiens")); // non-fixable spelling problem
22337     test_values.push_back(pair<string, string>("HUMAN", "Homo sapiens"));
22338     TestBulkSpecificHostFixList(test_values);
22339     test_values.push_back(pair<string, string>("Homo sapiens", "Homo sapiens"));
22340     TestBulkSpecificHostFixList(test_values);
22341     test_values.push_back(pair<string, string>("Gallus Gallus", "Gallus gallus"));
22342     TestBulkSpecificHostFixList(test_values);
22343     test_values.push_back(pair<string, string>("Conservemos nuestros", "Conservemos nuestros")); // non-fixable spelling problem
22344     TestBulkSpecificHostFixList(test_values);
22345     test_values.push_back(pair<string, string>("Pinus sp.", "Pinus sp.")); // ambiguous
22346     TestBulkSpecificHostFixList(test_values);
22347     test_values.push_back(pair<string, string>("Eschericia coli", "Escherichia coli")); // fixable spelling problem
22348     TestBulkSpecificHostFixList(test_values);
22349     test_values.push_back(pair<string, string>("Avian", "Avian"));
22350     TestBulkSpecificHostFixList(test_values);
22351     test_values.push_back(pair<string, string>("Bovine", "Bovine"));
22352     TestBulkSpecificHostFixList(test_values);
22353     test_values.push_back(pair<string, string>("Pig", "Pig"));
22354     TestBulkSpecificHostFixList(test_values);
22355     test_values.push_back(pair<string, string>(" Chicken", "Chicken")); // truncate space
22356     TestBulkSpecificHostFixList(test_values);
22357     test_values.push_back(pair<string, string>("Homo sapiens; sex: female", "Homo sapiens; sex: female"));
22358     TestBulkSpecificHostFixList(test_values);
22359     test_values.push_back(pair<string, string>("Atlantic white-sided dolphin", "Atlantic white-sided dolphin"));
22360     TestBulkSpecificHostFixList(test_values);
22361     test_values.push_back(pair<string, string>("Zymomonas anaerobia", "Zymomonas mobilis"));
22362     TestBulkSpecificHostFixList(test_values);
22363 
22364     vector<CRef<COrg_ref> > to_adjust;
22365     vector<CRef<COrg_ref> > original;
22366 
22367     ITERATE(THostStringsVector, it, test_values) {
22368         AddOrgmodDescriptor(entry, it->first, COrgMod::eSubtype_nat_host);
22369         AddOrgmodFeat(entry, it->first, COrgMod::eSubtype_nat_host);
22370         CRef<COrg_ref> org(new COrg_ref());
22371         org->SetTaxname("foo");
22372         CRef<COrgMod> om(new COrgMod(COrgMod::eSubtype_nat_host, it->first));
22373         org->SetOrgname().SetMod().push_back(om);
22374         to_adjust.push_back(org);
22375         CRef<COrg_ref> cpy(new COrg_ref());
22376         cpy->Assign(*org);
22377         original.push_back(cpy);
22378     }
22379     string error_message;
22380 
22381     CTaxValidationAndCleanup tval;
22382     tval.Init(*entry);
22383     vector<CRef<COrg_ref> > org_rq_list = tval.GetSpecificHostLookupRequest(true);
22384     // don't create update requests for single-word values
22385     // Homo sapiens is ignored because "HUMAN" already corrects to it
22386     BOOST_CHECK_EQUAL(org_rq_list.size(), test_values.size() - 6);
22387 
22388     objects::CTaxon3 taxon3;
22389     taxon3.Init();
22390     CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(org_rq_list);
22391 
22392     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust, error_message), true);
22393 
22394     vector<CRef<COrg_ref> >::iterator org = to_adjust.begin();
22395     vector<CRef<COrg_ref> >::iterator cpy = original.begin();
22396     while (org != to_adjust.cend()) {
22397         const string& before = (*cpy)->GetOrgname().GetMod().front()->GetSubname();
22398         const string& after = (*org)->GetOrgname().GetMod().front()->GetSubname();
22399         THostStringsVector::const_iterator tvit = test_values.cbegin();
22400         while (tvit != test_values.cend() && !NStr::Equal(tvit->first, before)) {
22401             ++tvit;
22402         }
22403 
22404         BOOST_CHECK_EQUAL(after, tvit->second);
22405         ++org;
22406         ++cpy;
22407         ++tvit;
22408     }
22409 
22410     CRef<COrg_ref> test_src(new COrg_ref());
22411     AddOrgmod(*test_src, "Conservemos nuestros", COrgMod::eSubtype_nat_host); // don't change because bad
22412     AddOrgmod(*test_src, "Pinus sp.", COrgMod::eSubtype_nat_host); // don't change because ambivalent
22413     AddOrgmod(*test_src, "Eschericia coli", COrgMod::eSubtype_nat_host); // change because spelling
22414 
22415     to_adjust.clear();
22416     to_adjust.push_back(test_src);
22417     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust, error_message), true);
22418     COrgName::TMod::const_iterator m = test_src->GetOrgname().GetMod().begin();
22419     BOOST_CHECK_EQUAL((*m)->GetSubname(), "Conservemos nuestros");
22420     ++m;
22421     BOOST_CHECK_EQUAL((*m)->GetSubname(), "Pinus sp.");
22422     ++m;
22423     BOOST_CHECK_EQUAL((*m)->GetSubname(), "Escherichia coli");
22424     // already fixed all problems, don't fix again
22425     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *reply, to_adjust, error_message), false);
22426     m = test_src->GetOrgname().GetMod().begin();
22427     BOOST_CHECK_EQUAL((*m)->GetSubname(), "Conservemos nuestros");
22428     ++m;
22429     BOOST_CHECK_EQUAL((*m)->GetSubname(), "Pinus sp.");
22430     ++m;
22431     BOOST_CHECK_EQUAL((*m)->GetSubname(), "Escherichia coli");
22432 
22433     vector< CRef<COrg_ref> > original_orgs = tval.GetTaxonomyLookupRequest();
22434     vector< CRef<COrg_ref> > edited_orgs = tval.GetTaxonomyLookupRequest();
22435     CRef<CTaxon3_reply> lookup_reply = taxon3.SendOrgRefList(original_orgs);
22436     BOOST_CHECK_EQUAL(lookup_reply->GetReply().size(), original_orgs.size());
22437     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*lookup_reply, edited_orgs, error_message), true);
22438     // second time should produce no additional changes
22439     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*lookup_reply, edited_orgs, error_message), false);
22440     vector< CRef<COrg_ref> > spec_host_rq = tval.GetSpecificHostLookupRequest(true);
22441     CRef<CTaxon3_reply> spec_host_reply = taxon3.SendOrgRefList(spec_host_rq);
22442     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *spec_host_reply, edited_orgs, error_message), true);
22443     // second time should produce no additional changes
22444     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithSpecificHostReply(org_rq_list, *spec_host_reply, edited_orgs, error_message), false);
22445 
22446     size_t num_descs = tval.NumDescs();
22447     size_t num_updated_descs = 0;
22448     for (size_t n = 0; n < num_descs; n++) {
22449         if (!original_orgs[n]->Equals(*(edited_orgs[n]))) {
22450             CConstRef<CSeqdesc> desc = tval.GetDesc(n);
22451             CRef<CSeqdesc> new_desc(new CSeqdesc());
22452             new_desc->Assign(*desc);
22453             new_desc->SetSource().SetOrg().Assign(*(edited_orgs[n]));
22454             num_updated_descs++;
22455         }
22456     }
22457     // we expect that all descs will be updated, because they have a recognizable taxname but none of the other data
22458     BOOST_CHECK_EQUAL(num_updated_descs, num_descs);
22459 
22460     size_t num_updated_feats = 0;
22461     for (size_t n = 0; n < tval.NumFeats(); n++) {
22462         if (!original_orgs[n + num_descs]->Equals(*edited_orgs[n + num_descs])) {
22463             CConstRef<CSeq_feat> feat = tval.GetFeat(n);
22464             CRef<CSeq_feat> new_feat(new CSeq_feat());
22465             new_feat->Assign(*feat);
22466             new_feat->SetData().SetBiosrc().SetOrg().Assign(*(edited_orgs[n]));
22467             num_updated_feats++;
22468         }
22469     }
22470     // only five of the feats will be updated, because their taxnames cannot be
22471     // recognized, and only five of the specific hosts are altered.
22472     BOOST_CHECK_EQUAL(num_updated_feats, (size_t)5);
22473 }
22474 
22475 
BOOST_AUTO_TEST_CASE(Test_VR_787)22476 BOOST_AUTO_TEST_CASE(Test_VR_787)
22477 {
22478     CRef<COrg_ref> org(new COrg_ref());
22479 
22480     org->SetTaxname("Dickeya dadantii subsp. dieffenbachiae");
22481     CRef<CDbtag> dbtag(new CDbtag());
22482     dbtag->SetDb("taxon");
22483     dbtag->SetTag().SetId(204040);
22484     org->SetDb().push_back(dbtag);
22485     org->SetOrgname().SetName().SetBinomial().SetGenus("Dickeya");
22486     org->SetOrgname().SetName().SetBinomial().SetSpecies("dadantii");
22487     org->SetOrgname().SetName().SetBinomial().SetSubspecies("dieffenbachiae");
22488     org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_strain, "PA1")));
22489     org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_nat_host, "Phalaenopsis sp. (orchid)")));
22490     org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_sub_species, "dieffenbachiae")));
22491     org->SetOrgname().SetLineage("Bacteria; Proteobacteria; Gammaproteobacteria");
22492     org->SetOrgname().SetGcode(11);
22493     org->SetOrgname().SetDiv("BCT");
22494 
22495     vector<CRef<COrg_ref> > org_rq;
22496     org_rq.push_back(org);
22497 
22498     vector<CRef<COrg_ref> > edited_orgs;
22499     CRef<COrg_ref> cpy(new COrg_ref());
22500     cpy->Assign(*org);
22501     edited_orgs.push_back(cpy);
22502 
22503     CTaxValidationAndCleanup tval;
22504 
22505     objects::CTaxon3 taxon3;
22506     taxon3.Init();
22507 
22508     CRef<CTaxon3_reply> org_reply = taxon3.SendOrgRefList(org_rq);
22509     string error_message;
22510     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*org_reply, edited_orgs, error_message), true);
22511     BOOST_CHECK_EQUAL(cpy->GetTaxname(), "Dickeya fangzhongdai");
22512 
22513     org->Reset();
22514     org->SetTaxname("Alnus cordata");
22515     dbtag->SetTag().SetId(109058);
22516     org->SetDb().push_back(dbtag);
22517     org->SetOrgname().SetName().SetBinomial().SetGenus("Alnus");
22518     org->SetOrgname().SetName().SetBinomial().SetSpecies("cordata");
22519     org->SetOrgname().SetAttrib("specified");
22520     org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_isolate, "AZ12-2")));
22521     org->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_sub_species, "Alnus cordata AZ12-2 chloroplast, complete genome")));
22522     org->SetOrgname().SetLineage("Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta; Spermatophyta; Magnoliophyta; eudicotyledons; Gunneridae; Pentapetalae; rosids; fabids; Fagales; Betulaceae; Alnus");
22523     org->SetOrgname().SetGcode(1);
22524     org->SetOrgname().SetMgcode(1);
22525     org->SetOrgname().SetDiv("PLN");
22526     org->SetOrgname().SetPgcode(11);
22527 
22528     cpy->Assign(*org);
22529 
22530     org_reply = taxon3.SendOrgRefList(org_rq);
22531     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*org_reply, edited_orgs, error_message), false);
22532     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsWithTaxLookupReply(*org_reply, edited_orgs, error_message, true), true);
22533     BOOST_CHECK_EQUAL(cpy->GetTaxname(), "Alnus cordata subsp. Alnus cordata AZ12-2 chloroplast, complete genome");
22534 
22535 }
22536 
22537 
BOOST_AUTO_TEST_CASE(Test_BulkSpecificHostFixIncremental)22538 BOOST_AUTO_TEST_CASE(Test_BulkSpecificHostFixIncremental)
22539 {
22540     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
22541 
22542     THostStringsVector test_values;
22543     test_values.push_back(pair<string, string>("Homo supiens", "Homo supiens")); // non-fixable spelling problem
22544     test_values.push_back(pair<string, string>("HUMAN", "Homo sapiens"));
22545     test_values.push_back(pair<string, string>("Homo sapiens", "Homo sapiens"));
22546     test_values.push_back(pair<string, string>("Pinus sp.", "Pinus sp.")); // ambiguous
22547     test_values.push_back(pair<string, string>("Gallus Gallus", "Gallus gallus"));
22548     test_values.push_back(pair<string, string>("Eschericia coli", "Escherichia coli")); // fixable spelling problem
22549     test_values.push_back(pair<string, string>("Avian", "Avian"));
22550     test_values.push_back(pair<string, string>("Bovine", "Bovine"));
22551     test_values.push_back(pair<string, string>("Pig", "Pig"));
22552     test_values.push_back(pair<string, string>(" Chicken", "Chicken")); // truncate space
22553     test_values.push_back(pair<string, string>("Homo sapiens; sex: female", "Homo sapiens; sex: female"));
22554     test_values.push_back(pair<string, string>("Atlantic white-sided dolphin", "Atlantic white-sided dolphin"));
22555 
22556     vector<CRef<COrg_ref> > to_adjust;
22557 
22558     ITERATE(THostStringsVector, it, test_values) {
22559         AddOrgmodDescriptor(entry, it->first, COrgMod::eSubtype_nat_host);
22560         AddOrgmodFeat(entry, it->first, COrgMod::eSubtype_nat_host);
22561         CRef<COrg_ref> org(new COrg_ref());
22562         org->SetTaxname("foo");
22563         AddOrgmod(*org, it->first, COrgMod::eSubtype_nat_host);
22564         to_adjust.push_back(org);
22565     }
22566     string error_message;
22567 
22568     CTaxValidationAndCleanup tval;
22569     tval.Init(*entry);
22570     vector<CRef<COrg_ref> > spec_host_rq = tval.GetSpecificHostLookupRequest(true);
22571     // don't create update requests for single-word values
22572     // Homo sapiens is ignored because "HUMAN" already corrects to it
22573     BOOST_CHECK_EQUAL(spec_host_rq.size(), test_values.size() - 6);
22574 
22575     objects::CTaxon3 taxon3;
22576     taxon3.Init();
22577 
22578     size_t chunk_size = 3;
22579     size_t i = 0;
22580     while (i < spec_host_rq.size()) {
22581         size_t len = min(chunk_size, spec_host_rq.size() - i);
22582         vector< CRef<COrg_ref> >  tmp_rq(spec_host_rq.begin() + i, spec_host_rq.begin() + i + len);
22583         CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(tmp_rq);
22584         BOOST_CHECK_EQUAL(tval.IncrementalSpecificHostMapUpdate(tmp_rq, *tmp_spec_host_reply), kEmptyStr);
22585         i += chunk_size;
22586     }
22587 
22588     BOOST_CHECK_EQUAL(tval.IsSpecificHostMapUpdateComplete(), true);
22589 
22590     BOOST_CHECK_EQUAL(tval.AdjustOrgRefsForSpecificHosts(to_adjust), true);
22591 
22592     vector<CRef<COrg_ref> >::iterator org = to_adjust.begin();
22593     THostStringsVector::iterator tvit = test_values.begin();
22594     while (org != to_adjust.end()) {
22595         BOOST_CHECK_EQUAL((*org)->GetOrgname().GetMod().front()->GetSubname(), tvit->second);
22596         ++org;
22597         ++tvit;
22598     }
22599 
22600 }
22601 
22602 
AddStrainDescriptor(CSeq_entry & entry,const string & taxname,const string & strain,const string & lineage)22603 void AddStrainDescriptor(CSeq_entry& entry, const string& taxname, const string& strain, const string& lineage)
22604 {
22605     CRef<CSeqdesc> src_desc(new CSeqdesc());
22606     // should look up
22607     src_desc->SetSource().SetOrg().SetTaxname(taxname);
22608     AddOrgmod(src_desc->SetSource().SetOrg(), strain, COrgMod::eSubtype_strain);
22609     src_desc->SetSource().SetOrg().SetOrgname().SetLineage(lineage);
22610     entry.SetDescr().Set().push_back(src_desc);
22611 }
22612 
22613 
TestOneStrain(const string & taxname,const string & strain,const string & lineage,bool expect_err)22614 void TestOneStrain(const string& taxname, const string& strain, const string& lineage, bool expect_err)
22615 {
22616     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
22617     CBioseq::TDescr::Tdata::iterator it = entry->SetSeq().SetDescr().Set().begin();
22618     while (it != entry->SetSeq().SetDescr().Set().end()) {
22619         if ((*it)->IsSource()) {
22620             it = entry->SetSeq().SetDescr().Set().erase(it);
22621         } else {
22622             ++it;
22623         }
22624     }
22625     AddStrainDescriptor(*entry, taxname, strain, lineage); // expect no report
22626     STANDARD_SETUP
22627 
22628     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NoTaxonID",
22629         "BioSource is missing taxon ID"));
22630     if (expect_err) {
22631         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "StrainContainsTaxInfo",
22632             "Strain '" + strain + "' contains taxonomic name information"));
22633     }
22634 
22635     eval = validator.Validate(seh, options);
22636     CheckErrors(*eval, expected_errors);
22637 
22638     CLEAR_ERRORS
22639 
22640 }
22641 
22642 
// Exercises the incremental strain workflow of CTaxValidationAndCleanup: the
// strain lookup request is sent to CTaxon3 in chunks and each reply is fed to
// IncrementalStrainMapUpdate(). Afterwards the same (taxname, strain, lineage)
// combinations, plus a few extra ones, are run through the validator one at a
// time via TestOneStrain().
BOOST_AUTO_TEST_CASE(Test_BulkStrainIncremental)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    AddStrainDescriptor(*entry, "Gorilla gorilla", "abc", "xyz"); // expect no report
    AddStrainDescriptor(*entry, "Gorilla gorilla", "Aeromonas punctata", "xyz"); // expect a report
    AddStrainDescriptor(*entry, "Gorilla gorilla", "Klebsiella_quasipneumoniae", "xyz"); // expect a report
    AddStrainDescriptor(*entry, "Bacillus sp.", "cereus", "xyz");
    AddStrainDescriptor(*entry, "Hippopotamus amphibius", "giraffe cow", "xyz"); // no error - giraffe looks up but is not in taxname

    CTaxValidationAndCleanup tval;
    tval.Init(*entry);

    vector<CRef<COrg_ref> > strain_rq = tval.GetStrainLookupRequest();
    BOOST_CHECK_EQUAL(strain_rq.size(), (size_t)9);

    objects::CTaxon3 taxon3;
    taxon3.Init();

    // Send the request in slices of at most chunk_size entries; the last
    // slice may be shorter.
    const size_t chunk_size = 3;
    for (size_t i = 0; i < strain_rq.size(); i += chunk_size) {
        size_t len = min(chunk_size, strain_rq.size() - i);
        vector< CRef<COrg_ref> >  tmp_rq(strain_rq.begin() + i, strain_rq.begin() + i + len);
        CRef<CTaxon3_reply> tmp_strain_reply = taxon3.SendOrgRefList(tmp_rq);
        // an empty string is the expected result for every chunk
        BOOST_CHECK_EQUAL(tval.IncrementalStrainMapUpdate(tmp_rq, *tmp_strain_reply), kEmptyStr);
    }

    BOOST_CHECK_EQUAL(tval.IsStrainMapUpdateComplete(), true);

    // Per-strain validator checks. An earlier note here said these were
    // commented out until TM-725 is resolved; the calls below are live.
    TestOneStrain("Hippopotamus amphibius", "giraffe cow", "xyz", false); // no error - giraffe looks up but is not in taxname
    TestOneStrain("Gorilla gorilla", "abc", "xyz", false);
    TestOneStrain("Gorilla gorilla", "Aeromonas punctata", "xyz", true);
    TestOneStrain("Gorilla gorilla", "Klebsiella_quasipneumoniae", "xyz", true);
    TestOneStrain("Bacillus sp.", "cereus", "xyz", true);

    // Combinations for which no StrainContainsTaxInfo warning is expected
    TestOneStrain("Ralstonia phage phiRSL1", "Aeromonas punctata", "xyz", false);
    TestOneStrain("Gorilla gorilla", "Aeromonas punctata", "viroid", false);
    TestOneStrain("Acetobacter sp.", "DsW_063", "Bacteria", false);
}
22687 
22688 
// VR-762: strain "P22" on "Cystobasidium minutum" must not produce a
// StrainContainsTaxInfo warning.
BOOST_AUTO_TEST_CASE(VR_762)
{
    const string taxname = "Cystobasidium minutum";
    const string strain  = "P22";
    const string lineage = "xyz";

    TestOneStrain(taxname, strain, lineage, false);
}
22693 
// VR-477: a code-break (translation exception) whose location is marked
// partial must be reported as TranslExceptIsPartial.
BOOST_AUTO_TEST_CASE(TEST_VR_477)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    // Code-break at 24..26 on "nuc", flagged partial at its stop end
    CRef<CCode_break> codebreak(new CCode_break());
    {
        auto& span = codebreak->SetLoc().SetInt();
        span.SetId().SetLocal().SetStr("nuc");
        span.SetFrom(24);
        span.SetTo(26);
    }
    codebreak->SetLoc().SetPartialStop(true, eExtreme_Positional);
    cds->SetData().SetCdregion().SetCode_break().push_back(codebreak);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|nuc", eDiag_Error, "TranslExceptIsPartial",
                           "Translation exception locations should not be partial"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
22715 
22716 
// VR-35: an exon whose "number" qualifier contains a space must be reported
// as InvalidNumberQualifier.
BOOST_AUTO_TEST_CASE(Test_VR_35)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Turn the misc feature into an exon and give it the offending qualifier
    CRef<CSeq_feat> exon = unit_test_util::AddMiscFeature(entry);
    exon->SetData().SetImp().SetKey("exon");
    CRef<CGb_qual> number_qual(new CGb_qual("number", "group I"));
    exon->SetQual().push_back(number_qual);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error, "InvalidNumberQualifier",
                           "Number qualifiers should not contain spaces"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
22733 
22734 
BOOST_AUTO_TEST_CASE(TEST_VR_15)22735 BOOST_AUTO_TEST_CASE(TEST_VR_15)
22736 {
22737     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
22738     CRef<CSeq_feat> feat = unit_test_util::AddMiscFeature(entry);
22739     feat->SetLocation().SetInt().SetFrom(0);
22740     feat->SetLocation().SetInt().SetFuzz_from().SetLim(CInt_fuzz::eLim_tl);
22741     feat->SetLocation().SetInt().SetTo(entry->GetSeq().GetInst().GetLength() - 1);
22742 
22743     STANDARD_SETUP
22744 
22745     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22746     "Should not specify 'space to left' at first position of non-circular sequence"));
22747     //AddChromosomeNoLocation(expected_errors, entry);
22748     eval = validator.Validate(seh, options);
22749     CheckErrors(*eval, expected_errors);
22750 
22751     CLEAR_ERRORS
22752 
22753     scope.RemoveTopLevelSeqEntry(seh);
22754     feat->SetLocation().SetInt().SetFuzz_from().SetLim(CInt_fuzz::eLim_tr);
22755     seh = scope.AddTopLevelSeqEntry(*entry);
22756     // not an error
22757     //AddChromosomeNoLocation(expected_errors, entry);
22758     eval = validator.Validate(seh, options);
22759     CheckErrors(*eval, expected_errors);
22760 
22761     CLEAR_ERRORS
22762 
22763     scope.RemoveTopLevelSeqEntry(seh);
22764     feat->SetLocation().SetInt().ResetFuzz_from();
22765     feat->SetLocation().SetInt().SetFuzz_to().SetLim(CInt_fuzz::eLim_tr);
22766     seh = scope.AddTopLevelSeqEntry(*entry);
22767     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22768         "Should not specify 'space to right' at last position of non-circular sequence"));
22769     //AddChromosomeNoLocation(expected_errors, entry);
22770     eval = validator.Validate(seh, options);
22771     CheckErrors(*eval, expected_errors);
22772 
22773     CLEAR_ERRORS
22774     //suppress if circular
22775     scope.RemoveTopLevelSeqEntry(seh);
22776     entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
22777     seh = scope.AddTopLevelSeqEntry(*entry);
22778     eval = validator.Validate(seh, options);
22779     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "CompleteCircleProblem", "Circular topology without complete flag set"));
22780     //AddChromosomeNoLocation(expected_errors, entry);
22781     CheckErrors(*eval, expected_errors);
22782 
22783     // also suppress for point
22784     scope.RemoveTopLevelSeqEntry(seh);
22785     feat->SetLocation().SetPnt().SetId().Assign(*(entry->GetSeq().GetId().front()));
22786     feat->SetLocation().SetPnt().SetPoint(0);
22787     feat->SetLocation().SetPnt().SetFuzz().SetLim(CInt_fuzz::eLim_tl);
22788     seh = scope.AddTopLevelSeqEntry(*entry);
22789     eval = validator.Validate(seh, options);
22790     CheckErrors(*eval, expected_errors);
22791     CLEAR_ERRORS
22792 
22793     scope.RemoveTopLevelSeqEntry(seh);
22794     entry->SetSeq().SetInst().ResetTopology();
22795     seh = scope.AddTopLevelSeqEntry(*entry);
22796     eval = validator.Validate(seh, options);
22797     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22798         "Should not specify 'space to left' at first position of non-circular sequence"));
22799     //AddChromosomeNoLocation(expected_errors, entry);
22800     CheckErrors(*eval, expected_errors);
22801 
22802     CLEAR_ERRORS
22803 
22804     scope.RemoveTopLevelSeqEntry(seh);
22805     feat->SetLocation().SetPnt().SetPoint(entry->GetSeq().GetInst().GetLength() - 1);
22806     feat->SetLocation().SetPnt().SetFuzz().SetLim(CInt_fuzz::eLim_tr);
22807     seh = scope.AddTopLevelSeqEntry(*entry);
22808     eval = validator.Validate(seh, options);
22809     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidFuzz",
22810         "Should not specify 'space to right' at last position of non-circular sequence"));
22811     //AddChromosomeNoLocation(expected_errors, entry);
22812     CheckErrors(*eval, expected_errors);
22813 
22814     CLEAR_ERRORS
22815 
22816 }
22817 
22818 
// VR-433: a WGS delta sequence whose first component points at only part of
// gb|AY123456| is expected to raise FarLocationExcludesFeatures; the error is
// expected to disappear once the record carries a RefSeq-style accession
// (with BioSample/BioProject DBLink data added).
BOOST_FIXTURE_TEST_CASE(Test_VR_433, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();

    // Re-point the first delta component at AY123456:0..11
    {
        auto& far_int = entry->SetSeq().SetInst().SetExt().SetDelta().Set().front()->SetLoc().SetInt();
        far_int.SetId().SetGenbank().SetAccession("AY123456");
        far_int.SetFrom(0);
        far_int.SetTo(11);
    }
    unit_test_util::SetTech(entry, CMolInfo::eTech_wgs);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error,
                           "FarLocationExcludesFeatures",
                           "Scaffold points to some but not all of gb|AY123456|, excluded portion contains features"));
    AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS

    // suppress error if RefSeq
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSeq().SetId().front()->SetOther().SetAccession("NC_00000001");

    // DBLink user object: BioSample first, then BioProject
    CRef<CUser_field> f(new CUser_field());
    f->SetLabel().SetStr("BioSample");
    f->SetData().SetStr("SAME0001");

    CRef<CUser_field> f2(new CUser_field());
    f2->SetLabel().SetStr("BioProject");
    f2->SetData().SetStrs().push_back("PRJNA12345");

    CRef<CSeqdesc> biosample(new CSeqdesc());
    biosample->SetUser().SetType().SetStr("DBLink");
    biosample->SetUser().SetData().push_back(f);
    biosample->SetUser().SetData().push_back(f2);
    entry->SetSeq().SetDescr().Set().push_back(biosample);

    seh = scope.AddTopLevelSeqEntry(*entry);
    AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
22859 
22860 
// VR-708: chromosome and linkage-group names "_abc" and "*123" must each be
// reported as BadPlasmidChromosomeLinkageName.
BOOST_FIXTURE_TEST_CASE(Test_VR_708, CGenBankFixture)
{
    const string bad_chromosome = "_abc";
    const string bad_linkage_group = "*123";
    const string msg_prefix = "Problematic plasmid/chromosome/linkage group name '";

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    // The chromosome subsource is first set with an empty value, then with
    // the name under test.
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_chromosome, "");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_chromosome, bad_chromosome);
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_linkage_group, bad_linkage_group);

    STANDARD_SETUP

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error,
                           "BadPlasmidChromosomeLinkageName",
                           msg_prefix + bad_chromosome + "'"));
    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error,
                           "BadPlasmidChromosomeLinkageName",
                           msg_prefix + bad_linkage_group + "'"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
22880 
22881 
// TM-145: "Rhesus monkey" is an acceptable specific host. It must be left
// unchanged by FixSpecificHost(), accepted by IsSpecificHostValid(), and
// produce no validator errors when used as a nat_host modifier.
BOOST_AUTO_TEST_CASE(Test_TM_145)
{
    // single definition of the host value used by every check below
    const string host = "Rhesus monkey";
    string error_msg;

    BOOST_CHECK_EQUAL(host, FixSpecificHost(host));
    BOOST_CHECK_EQUAL(true, IsSpecificHostValid(host, error_msg));


    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, host);

    STANDARD_SETUP

    // no errors expected
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
22903 
#if 0
// Disabled for now (compiled out by the surrounding #if 0).
// VR-723: the taxname "Sebaea microphylla" is compared with the name derived
// from each OrgName variant (binomial, virus, hybrid, named hybrid, partial);
// a mismatch is expected to be reported as BioSourceInconsistency.
BOOST_AUTO_TEST_CASE(Test_VR_723)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Locate the source descriptor (the last one wins if there are several)
    CRef<CBioSource> src;
    NON_CONST_ITERATE(CBioseq::TDescr::Tdata, desc_it, entry->SetSeq().SetDescr().Set()) {
        if ((*desc_it)->IsSource()) {
            src.Reset(&((*desc_it)->SetSource()));
        }
    }
    COrgName::C_Name& orgname = src->SetOrg().SetOrgname().SetName();
    STANDARD_SETUP

    // Every expected message starts with this text and ends with the
    // orgname-derived name followed by "')".
    const string mismatch = "Taxname does not match orgname ('Sebaea microphylla', '";

    // binomial: matching name -> no error
    orgname.SetBinomial().SetGenus("Sebaea");
    orgname.SetBinomial().SetSpecies("microphylla");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // binomial: wrong genus
    orgname.SetBinomial().SetGenus("x");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "BioSourceInconsistency",
        mismatch + "x microphylla')"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // binomial: wrong genus and species
    orgname.SetBinomial().SetSpecies("y");
    expected_errors[0]->SetErrMsg(mismatch + "x y')");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // binomial: with subspecies
    orgname.SetBinomial().SetSubspecies("z");
    expected_errors[0]->SetErrMsg(mismatch + "x y subsp. z')");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // virus: mismatching, then matching
    orgname.SetVirus("x");
    expected_errors[0]->SetErrMsg(mismatch + "x')");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
    orgname.SetVirus("Sebaea microphylla");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // hybrid: neither element matches
    CRef<COrgName> org1(new COrgName());
    org1->SetName().SetBinomial().SetSpecies("z");
    org1->SetName().SetBinomial().SetGenus("x");
    CRef<COrgName> org2(new COrgName());
    org2->SetName().SetBinomial().SetGenus("y");
    org2->SetName().SetBinomial().SetSpecies("z");
    orgname.SetHybrid().Set().push_back(org1);
    orgname.SetHybrid().Set().push_back(org2);
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "BioSourceInconsistency",
        mismatch + "x z')"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // hybrid: second element matches -> no error
    org2->SetName().SetBinomial().SetGenus("Sebaea");
    org2->SetName().SetBinomial().SetSpecies("microphylla");
    CLEAR_ERRORS
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // named hybrid
    orgname.SetNamedhybrid().SetGenus("Sebaea");
    orgname.SetNamedhybrid().SetSpecies("microphylla");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "BioSourceInconsistency",
        mismatch + "Sebaea x microphylla')"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
    orgname.SetNamedhybrid().SetGenus("x");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "BioSourceInconsistency",
        mismatch + "x x microphylla')"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // partial: one mismatching element
    CRef<CTaxElement> elem1(new CTaxElement());
    elem1->SetFixed_level(CTaxElement::eFixed_level_class);
    elem1->SetName("x");
    orgname.SetPartial().Set().push_back(elem1);
    expected_errors[0]->SetErrMsg(mismatch + "x')");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // partial: a second, matching element -> no error
    CRef<CTaxElement> elem2(new CTaxElement());
    elem2->SetFixed_level(CTaxElement::eFixed_level_family);
    elem2->SetName("Sebaea microphylla");
    orgname.SetPartial().Set().push_back(elem2);
    CLEAR_ERRORS
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

}
#endif
23010 
23011 
// VR-728: Seq-ids that are stripped during ID load.
//   1. a Bioseq whose only id is gnl|NCBIFILE|x         -> NoIdOnBioseq
//   2. the same Bioseq with an additional local id      -> no error
//   3. a feature located on a gnl|BankIt|x id           -> BadSeqIdFormat
BOOST_AUTO_TEST_CASE(Test_VR_728)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();
    {
        CDbtag& general = entry->SetSeq().SetId().front()->SetGeneral();
        general.SetDb("NCBIFILE");
        general.SetTag().SetStr("x");
    }

    STANDARD_SETUP

    // 1. only a strippable id
    expected_errors.push_back(
        new CExpectedError("gnl|NCBIFILE|x", eDiag_Critical,
                           "NoIdOnBioseq",
                           "The only ids on this Bioseq will be stripped during ID load"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS

    // 2. add a local id
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> other_id(new CSeq_id());
    other_id->SetLocal().SetStr("x");
    entry->SetSeq().SetId().push_back(other_id);
    seh = scope.AddTopLevelSeqEntry(*entry);
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // 3. add a BankIt id and place a misc feature on it
    scope.RemoveTopLevelSeqEntry(seh);
    CRef<CSeq_id> bankit(new CSeq_id());
    {
        CDbtag& general = bankit->SetGeneral();
        general.SetDb("BankIt");
        general.SetTag().SetStr("x");
    }
    entry->SetSeq().SetId().push_back(bankit);
    CRef<CSeq_feat> misc = AddMiscFeature(entry);
    misc->SetLocation().SetInt().SetId().Assign(*bankit);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(
        new CExpectedError("lcl|x", eDiag_Critical,
                           "BadSeqIdFormat",
                           "Feature locations should not use Seq-ids that will be stripped during ID load"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
23057 
23058 
// VR-733: a location on "both" strands is accepted for a misc_feature but
// must be reported as BothStrands once the feature key is "exon".
BOOST_AUTO_TEST_CASE(Test_VR_733)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();
    CRef<CSeq_feat> both_strand_feat = AddMiscFeature(entry);
    both_strand_feat->SetLocation().SetInt().SetStrand(eNa_strand_both);

    STANDARD_SETUP

    // expect no errors for misc_feat
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS

    // same location, now as an exon
    scope.RemoveTopLevelSeqEntry(seh);
    both_strand_feat->SetData().SetImp().SetKey("exon");
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(
        new CExpectedError("lcl|good", eDiag_Error,
                           "BothStrands",
                           "exon may not be on both (forward) strands"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
23086 
23087 
23088 
TestOnePlasmid(const string & plasmid_name,bool expect_error)23089 void TestOnePlasmid(const string& plasmid_name, bool expect_error)
23090 {
23091     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
23092     unit_test_util::SetGenome(entry, CBioSource::eGenome_plasmid);
23093     unit_test_util::SetSubSource(entry, CSubSource::eSubtype_plasmid_name, plasmid_name);
23094     STANDARD_SETUP
23095 
23096     if (expect_error) {
23097         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadPlasmidChromosomeLinkageName",
23098                     "Problematic plasmid/chromosome/linkage group name '" + plasmid_name + "'"));
23099     }
23100     //AddChromosomeNoLocation(expected_errors, entry);
23101     eval = validator.Validate(seh, options);
23102     CheckErrors(*eval, expected_errors);
23103     CLEAR_ERRORS
23104 }
23105 
23106 
// VR-742: plasmid names that must be rejected and plasmid names that must be
// accepted, each checked through TestOnePlasmid().
BOOST_AUTO_TEST_CASE(Test_VR_742)
{
    // names expected to raise BadPlasmidChromosomeLinkageName
    static const char* const kRejected[] = {
        "plasmid",
        "Sebaea microphylla"
    };
    // these values are ok
    static const char* const kAccepted[] = {
        "megaplasmid",
        "2micron",
        "psomething",
        "unnamed",
        "unnamed2",
        "unnamed234"
    };

    for (const char* name : kRejected) {
        TestOnePlasmid(name, true);
    }
    for (const char* name : kAccepted) {
        TestOnePlasmid(name, false);
    }
}
23120 
23121 
// VR-751: IsLikelyTaxname() must accept the two binomial-looking names below
// and reject the common name "Atlantic white-sided dolphin".
BOOST_AUTO_TEST_CASE(Test_VR_751)
{
    // expected to be recognised as likely taxnames
    BOOST_CHECK_EQUAL(IsLikelyTaxname("Convolvulus sindicus"), true);
    BOOST_CHECK_EQUAL(IsLikelyTaxname("Lasiurus scindicus"), true);

    // expected NOT to be recognised as a likely taxname
    BOOST_CHECK_EQUAL(IsLikelyTaxname("Atlantic white-sided dolphin"), false);
}
23128 
23129 
BOOST_AUTO_TEST_CASE(Test_TripletEncodesStopCodon)
{
    CRef<CSeq_entry> entry = BuildGoodNucProtSet();
    CRef<CSeq_entry> nuc = GetNucleotideSequenceFromGoodNucProtSet(entry);
    CRef<CSeq_feat> cds = GetCDSFromGoodNucProtSet(entry);

    nuc->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("ATGCCCAGATAAACAGAGATATAATAAGGGATGCCCAGAAAAACAGAGATAAACTAAGGG");
    CRef<CSeq_id> nuc_id = nuc->SetSeq().SetId().front();

    // Replace the CDS location with a mix of four exons.  The three-base
    // gaps between them sit at 9-11 (TAA), 21-23 (TAA) and 45-47 (GAG):
    // the first two "introns" are stop codons, the third is not.
    static const unsigned int kExonBounds[][2] = {
        {  0,  8 },
        { 12, 20 },
        { 24, 44 },
        { 48, 59 }
    };
    for (const auto& bounds : kExonBounds) {
        CRef<CSeq_loc> exon(new CSeq_loc(*nuc_id, bounds[0], bounds[1]));
        cds->SetLocation().SetMix().Set().push_back(exon);
    }

    STANDARD_SETUP

    // Direct check of the helper: exactly the two TAA gaps are returned.
    auto nonsense = CCDSTranslationProblems::GetNonsenseIntrons(*cds, scope);
    BOOST_CHECK_EQUAL(nonsense.size(), (size_t)2);
    const auto& first_hit = nonsense.front()->GetInt();
    BOOST_CHECK_EQUAL(first_hit.GetFrom(), (size_t)9);
    BOOST_CHECK_EQUAL(first_hit.GetTo(), (size_t)11);
    const auto& second_hit = nonsense.back()->GetInt();
    BOOST_CHECK_EQUAL(second_hit.GetFrom(), (size_t)21);
    BOOST_CHECK_EQUAL(second_hit.GetTo(), (size_t)23);

    // Full validation: one IntronIsStopCodon per offending gap, followed
    // by the remaining reports this artificial CDS triggers.
    static const char* const kNucAcc = "lcl|nuc";
    for (int i = 0; i < 2; ++i) {
        expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Critical, "IntronIsStopCodon",
            "Triplet intron encodes stop codon"));
    }
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Warning, "ShortExon",
        "Internal coding region exon is too short at position 13-21"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Error, "InternalStop",
        "2 internal stops. Genetic code [0]"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Error, "NoStop",
        "Missing stop codon"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Error, "TransLen",
        "Given protein length [8] does not match translation length [17]"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Warning, "NotSpliceConsensusDonor",
        "Splice donor consensus (GT) not found after exon ending at position 9 of lcl|nuc"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Warning, "NotSpliceConsensusDonor",
        "Splice donor consensus (GT) not found after exon ending at position 21 of lcl|nuc"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Warning, "NotSpliceConsensusDonor",
        "Splice donor consensus (GT) not found after exon ending at position 45 of lcl|nuc"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Warning, "NotSpliceConsensusAcceptor",
        "Splice acceptor consensus (AG) not found before exon starting at position 13 of lcl|nuc"));
    expected_errors.push_back(new CExpectedError(kNucAcc, eDiag_Warning, "NotSpliceConsensusAcceptor",
        "Splice acceptor consensus (AG) not found before exon starting at position 25 of lcl|nuc"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);
    CLEAR_ERRORS
}
23175 
BOOST_AUTO_TEST_CASE(VR_758)
{
    // Assemble a delta-represented protein by hand:
    // literal "MPRK", a gap of length 10, literal "TEIN" (18 in total).
    CRef<objects::CBioseq> protein(new objects::CBioseq());
    auto& inst = protein->SetInst();
    inst.SetMol(objects::CSeq_inst::eMol_aa);
    inst.SetRepr(objects::CSeq_inst::eRepr_delta);

    auto& delta = inst.SetExt().SetDelta();
    delta.AddLiteral("MPRK", objects::CSeq_inst::eMol_aa);
    CRef<objects::CDelta_seq> gap_part(new objects::CDelta_seq());
    gap_part->SetLiteral().SetSeq_data().SetGap();
    gap_part->SetLiteral().SetLength(10);
    delta.Set().push_back(gap_part);
    delta.AddLiteral("TEIN", objects::CSeq_inst::eMol_aa);
    inst.SetLength(18);

    // Local ID "prot".
    CRef<objects::CSeq_id> local_id(new objects::CSeq_id());
    local_id->SetLocal().SetStr("prot");
    protein->SetId().push_back(local_id);

    // Molinfo descriptor: complete peptide.
    CRef<objects::CSeqdesc> molinfo_desc(new objects::CSeqdesc());
    auto& molinfo = molinfo_desc->SetMolinfo();
    molinfo.SetBiomol(objects::CMolInfo::eBiomol_peptide);
    molinfo.SetCompleteness(objects::CMolInfo::eCompleteness_complete);
    protein->SetDescr().Set().push_back(molinfo_desc);

    // Wrap the bioseq in an entry and give it a source and a publication.
    CRef<objects::CSeq_entry> entry(new objects::CSeq_entry());
    entry->SetSeq(*protein);

    AddGoodSource(entry);
    AddGoodPub(entry);

    // Protein feature spanning the whole sequence (0..17).
    CRef<objects::CSeq_feat> prot_feat(new objects::CSeq_feat());
    prot_feat->SetData().SetProt().SetName().push_back("fake protein name");
    auto& span = prot_feat->SetLocation().SetInt();
    span.SetId().SetLocal().SetStr("prot");
    span.SetFrom(0);
    span.SetTo(17);
    AddFeat(prot_feat, entry);

    STANDARD_SETUP

    // The gap segment must be reported.
    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error, "ProteinShouldNotHaveGaps",
        "Protein sequences should not have gaps"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23221 
23222 
CheckLocalId(const string & id,const string & badchar)23223 void CheckLocalId(const string& id, const string& badchar)
23224 {
23225     CRef<CSeq_entry> entry = BuildGoodSeq();
23226     entry->SetSeq().SetId().front()->SetLocal().SetStr(id);
23227     STANDARD_SETUP
23228 
23229     expected_errors.push_back(new CExpectedError("lcl|" + id, eDiag_Warning, "BadSeqIdFormat",
23230            "Bad character '" + badchar + "' in local ID '" + id + "'"));
23231     //AddChromosomeNoLocation(expected_errors, entry);
23232     eval = validator.Validate(seh, options);
23233     CheckErrors(*eval, expected_errors);
23234 
23235     CLEAR_ERRORS
23236 }
23237 
23238 
BOOST_AUTO_TEST_CASE(VR_V48)
{
    // A vertical bar inside a local ID must be reported as a bad character.
    static const char* const kLocalId   = "abc|def";
    static const char* const kOffending = "|";
    CheckLocalId(kLocalId, kOffending);
}
23243 
23244 
BOOST_AUTO_TEST_CASE(Test_IsDateInPast)
{
    // A freshly constructed date is not reported as being in the past.
    CRef<CDate> date(new CDate());
    BOOST_CHECK_EQUAL(IsDateInPast(*date), false);

    // Today (day precision) is not in the past.
    date.Reset(new CDate(CTime(CTime::eCurrent), CDate::ePrecision_day));
    BOOST_CHECK_EQUAL(IsDateInPast(*date), false);

    // Day granularity.  The guards keep the modified day inside 1..28 so
    // the date stays valid in every month.
    auto curr_day = date->GetStd().GetDay();
    if (curr_day < 28) {
        date->SetStd().SetDay(curr_day + 1);
        BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
    }
    if (curr_day > 1) {
        date->SetStd().SetDay(curr_day - 1);
        BOOST_CHECK_EQUAL(IsDateInPast(*date), true);
    }
    // With no day set, the current month is not in the past.
    date->SetStd().ResetDay();
    BOOST_CHECK_EQUAL(IsDateInPast(*date), false);

    // Month granularity.  Date-std months run 1..12 (NCBI general ASN.1
    // spec), so the guards mirror the day guards above: never step past
    // December and never produce the invalid month 0 in January.
    auto curr_month = date->GetStd().GetMonth();
    if (curr_month < 12) {
        date->SetStd().SetMonth(curr_month + 1);
        BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
    }
    if (curr_month > 1) {
        date->SetStd().SetMonth(curr_month - 1);
        BOOST_CHECK_EQUAL(IsDateInPast(*date), true);
    }
    // With no month set, the current year is not in the past.
    date->SetStd().ResetMonth();
    BOOST_CHECK_EQUAL(IsDateInPast(*date), false);

    // Year granularity: next year is not past, last year is.
    auto curr_year = date->GetStd().GetYear();
    date->SetStd().SetYear(curr_year + 1);
    BOOST_CHECK_EQUAL(IsDateInPast(*date), false);
    date->SetStd().SetYear(curr_year - 1);
    BOOST_CHECK_EQUAL(IsDateInPast(*date), true);
}
23282 
23283 
AddYear(CDate & add_date)23284 void AddYear(CDate& add_date)
23285 {
23286     CTime t(add_date.GetStd().GetYear(), add_date.GetStd().GetMonth(), add_date.GetStd().GetDay());
23287     t.AddYear();
23288     CDate new_date(t);
23289     add_date.Assign(new_date);
23290 }
23291 
23292 
AddMonth(CDate & add_date)23293 void AddMonth(CDate& add_date)
23294 {
23295     CTime t(add_date.GetStd().GetYear(), add_date.GetStd().GetMonth(), add_date.GetStd().GetDay());
23296     t.AddMonth();
23297     CDate new_date(t);
23298     add_date.Assign(new_date);
23299 }
23300 
23301 
AddDay(CDate & add_date)23302 void AddDay(CDate& add_date)
23303 {
23304     CTime t(add_date.GetStd().GetYear(), add_date.GetStd().GetMonth(), add_date.GetStd().GetDay());
23305     t.AddDay();
23306     CDate new_date(t);
23307     add_date.Assign(new_date);
23308 }
23309 
23310 
BOOST_AUTO_TEST_CASE(VR_778)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Find the submission citation (Cit-sub) among the pub descriptors.
    CRef<CPub> subpub;
    for (auto& desc : entry->SetSeq().SetDescr().Set()) {
        if (!desc->IsPub()) {
            continue;
        }
        const auto& pubs = desc->GetPub().GetPub().Get();
        // Guard against an empty pub list before looking at front().
        if (!pubs.empty() && pubs.front()->IsSub()) {
            subpub = desc->SetPub().SetPub().Set().front();
        }
    }
    // Everything below dereferences subpub; stop the test case with a
    // clear failure instead of a null dereference if the fixture has no
    // submission citation.
    BOOST_REQUIRE(subpub.NotEmpty());

    STANDARD_SETUP

    time_t time_now = time(NULL);
    CDate today(time_now);
    CDate future(time_now);

    // One year ahead: the submission date must be flagged as future.
    AddYear(future);
    subpub->SetSub().SetDate().Assign(future);
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadDate",
                              "Submission citation date is in the future"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors (*eval, expected_errors);

    // One month ahead: same warning.
    future.Assign(today);
    AddMonth(future);
    subpub->SetSub().SetDate().Assign(future);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // One day ahead: same warning.
    future.Assign(today);
    AddDay(future);
    subpub->SetSub().SetDate().Assign(future);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Today's date: no warning expected.
    subpub->SetSub().SetDate().Assign(today);
    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23360 
23361 
BOOST_AUTO_TEST_CASE(Test_InconsistentPseudogeneValue)23362 BOOST_AUTO_TEST_CASE(Test_InconsistentPseudogeneValue)
23363 {
23364     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
23365 
23366     CRef<CSeq_feat> cds = unit_test_util::AddMiscFeature(entry);
23367     cds->SetData().SetCdregion();
23368     cds->ResetComment();
23369     cds->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23370 
23371     CRef<CSeq_feat> mrna = unit_test_util::AddMiscFeature(entry);
23372     mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
23373     mrna->ResetComment();
23374     mrna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23375 
23376     CRef<CSeq_feat> gene = unit_test_util::AddMiscFeature(entry);
23377     gene->SetData().SetGene().SetLocus("x");
23378     gene->ResetComment();
23379     gene->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23380 
23381     // no errors, all features have matching pseudogene values
23382     STANDARD_SETUP
23383 
23384     eval = validator.Validate(seh, options);
23385     //AddChromosomeNoLocation(expected_errors, entry);
23386     CheckErrors(*eval, expected_errors);
23387 
23388     // no errors if cds has no pseudogene but mrna and gene do
23389     cds->ResetQual();
23390     eval = validator.Validate(seh, options);
23391     CheckErrors(*eval, expected_errors);
23392 
23393     // no errors if mrna and cds have no pseudogene but gene does
23394     mrna->ResetQual();
23395     eval = validator.Validate(seh, options);
23396     CheckErrors(*eval, expected_errors);
23397 
23398     // no errors if mrna has no pseudogene but cds and gene do
23399     cds->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23400     eval = validator.Validate(seh, options);
23401     CheckErrors(*eval, expected_errors);
23402 
23403     // error if cds has pseudogene but gene does not
23404     gene->ResetQual();
23405     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23406         "InconsistentPseudogeneValue",
23407         "CDS has pseudogene qualifier, gene does not"));
23408     eval = validator.Validate(seh, options);
23409     CheckErrors(*eval, expected_errors);
23410 
23411     // also error if mRNA has pseudogene but gene does not
23412     mrna->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "unitary")));
23413     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23414         "InconsistentPseudogeneValue",
23415         "mRNA has pseudogene qualifier, gene does not"));
23416     eval = validator.Validate(seh, options);
23417     CheckErrors(*eval, expected_errors);
23418 
23419     CLEAR_ERRORS
23420 
23421     // different errors when pseudogene values conflict
23422     gene->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("pseudogene", "allelic")));
23423     mrna->SetQual().front()->SetVal("processed");
23424     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23425         "InconsistentPseudogeneValue",
23426         "Different pseudogene values on CDS (unitary) and gene (allelic)"));
23427     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
23428         "InconsistentPseudogeneValue",
23429         "Different pseudogene values on mRNA (processed) and gene (allelic)"));
23430     //AddChromosomeNoLocation(expected_errors, entry);
23431 
23432     eval = validator.Validate(seh, options);
23433     CheckErrors(*eval, expected_errors);
23434 
23435     CLEAR_ERRORS
23436 
23437 }
23438 
23439 
BOOST_AUTO_TEST_CASE(Test_InvalidPseudoQualifier)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Gene feature whose /pseudogene qualifier starts out empty.
    CRef<CSeq_feat> gene_feat = unit_test_util::AddMiscFeature(entry);
    gene_feat->SetData().SetGene().SetLocus("x");
    gene_feat->ResetComment();
    CRef<CGb_qual> empty_pseudogene(new CGb_qual("pseudogene", ""));
    gene_feat->SetQual().push_back(empty_pseudogene);

    STANDARD_SETUP

    static const char* const kAcc = "lcl|good";

    // Empty value: flagged as an invalid pseudogene value and also as a
    // qualifier made of nothing but quotation marks.
    expected_errors.push_back(new CExpectedError(kAcc, eDiag_Warning,
        "InvalidPseudoQualifier",
        "/pseudogene value should not be empty"));
    expected_errors.push_back(new CExpectedError(kAcc, eDiag_Warning,
        "InvalidPunctuation",
        "Qualifier other than replace has just quotation marks"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Unrecognized value: only the pseudogene-specific warning remains.
    gene_feat->SetQual().front()->SetVal("abc");
    expected_errors.push_back(new CExpectedError(kAcc, eDiag_Warning,
        "InvalidPseudoQualifier",
        "/pseudogene value should not be 'abc'"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23473 
23474 
BOOST_AUTO_TEST_CASE(Test_InvalidRptUnitRange)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // repeat_region feature with a /rpt_unit_range that is not a range.
    CRef<CSeq_feat> repeat = unit_test_util::AddMiscFeature(entry);
    repeat->SetData().SetImp().SetKey("repeat_region");
    CRef<CGb_qual> range_qual(new CGb_qual("rpt_unit_range", "x"));
    repeat->SetQual().push_back(range_qual);

    STANDARD_SETUP

    // "x" is rejected ...
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "InvalidRptUnitRange",
        "/rpt_unit_range is not a base range"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // ... and so is "a..b", with the same warning.
    repeat->SetQual().front()->SetVal("a..b");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // A numeric range validates cleanly.
    repeat->SetQual().front()->SetVal("1..5");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23505 
23506 
BOOST_AUTO_TEST_CASE(Test_InvalidRptUnitSeqCharacters)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // repeat_region feature whose /rpt_unit_seq holds "x..y".
    CRef<CSeq_feat> repeat = unit_test_util::AddMiscFeature(entry);
    repeat->SetData().SetImp().SetKey("repeat_region");
    CRef<CGb_qual> seq_qual(new CGb_qual("rpt_unit_seq", "x..y"));
    repeat->SetQual().push_back(seq_qual);

    STANDARD_SETUP

    // "x..y" is reported as containing illegal characters.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "InvalidRptUnitSeqCharacters",
        "/rpt_unit_seq has illegal characters"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // "(atgc)" is accepted without any report.
    repeat->SetQual().front()->SetVal("(atgc)");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23533 
23534 
BOOST_AUTO_TEST_CASE(Test_MismatchedAllele)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // rRNA feature carrying /allele=x ...
    CRef<CSeq_feat> rrna_feat = unit_test_util::AddMiscFeature(entry);
    auto& rna_ref = rrna_feat->SetData().SetRna();
    rna_ref.SetType(CRNA_ref::eType_rRNA);
    rna_ref.SetExt().SetName("16S ribosomal RNA");
    CRef<CGb_qual> allele_qual(new CGb_qual("allele", "x"));
    rrna_feat->SetQual().push_back(allele_qual);

    // ... and a gene made for it whose allele is "y".
    CRef<CSeq_feat> gene_feat = unit_test_util::MakeGeneForFeature(rrna_feat);
    unit_test_util::AddFeat(gene_feat, entry);
    gene_feat->SetData().SetGene().SetAllele("y");

    STANDARD_SETUP

    // The two differing allele values must be reported.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "MismatchedAllele",
        "Mismatched allele qualifier on gene (y) and feature (x)"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23558 
23559 
BOOST_AUTO_TEST_CASE(Test_InvalidAlleleDuplicates)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // rRNA feature carrying /allele=x ...
    CRef<CSeq_feat> rrna_feat = unit_test_util::AddMiscFeature(entry);
    auto& rna_ref = rrna_feat->SetData().SetRna();
    rna_ref.SetType(CRNA_ref::eType_rRNA);
    rna_ref.SetExt().SetName("16S ribosomal RNA");
    CRef<CGb_qual> allele_qual(new CGb_qual("allele", "x"));
    rrna_feat->SetQual().push_back(allele_qual);

    // ... and a gene made for it that repeats the very same allele.
    CRef<CSeq_feat> gene_feat = unit_test_util::MakeGeneForFeature(rrna_feat);
    unit_test_util::AddFeat(gene_feat, entry);
    gene_feat->SetData().SetGene().SetAllele("x");

    STANDARD_SETUP

    // The duplicated allele value must be reported as redundant.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "InvalidAlleleDuplicates",
        "Redundant allele qualifier (x) on gene and feature"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23583 
23584 
BOOST_AUTO_TEST_CASE(Test_InvalidOperonMatchesGene)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Operon feature named "x" ...
    CRef<CSeq_feat> operon_feat = unit_test_util::AddMiscFeature(entry);
    operon_feat->SetData().SetImp().SetKey("operon");
    CRef<CGb_qual> operon_qual(new CGb_qual("operon", "x"));
    operon_feat->SetQual().push_back(operon_qual);

    // ... plus a gene made for it whose locus is also "x".
    CRef<CSeq_feat> gene_feat = unit_test_util::MakeGeneForFeature(operon_feat);
    unit_test_util::AddFeat(gene_feat, entry);
    gene_feat->SetData().SetGene().SetLocus("x");

    STANDARD_SETUP

    // The name clash must be reported.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "InvalidOperonMatchesGene",
        "Operon is same as gene - x"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
23608 
23609 
BOOST_AUTO_TEST_CASE(Test_InvalidCompareRefSeqAccession)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Add a second ID built from "AY123456.1"; errors are expected to be
    // reported against it.
    CRef<CSeq_id> extra_id(new CSeq_id("AY123456.1"));
    entry->SetSeq().SetId().push_back(extra_id);

    // variation feature whose /compare names an NC_ accession.
    CRef<CSeq_feat> variation = unit_test_util::AddMiscFeature(entry);
    variation->SetData().SetImp().SetKey("variation");
    CRef<CGb_qual> compare_qual(new CGb_qual("compare", "NC_000001.1"));
    variation->SetQual().push_back(compare_qual);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error,
        "InvalidCompareRefSeqAccession",
        "RefSeq accession NC_000001.1 cannot be used for qualifier compare"));
    //AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
23630 
23631 
BOOST_AUTO_TEST_CASE(Test_InvalidCompareMissingVersion)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Add a second ID built from "AY123456.1"; errors are expected to be
    // reported against it.
    CRef<CSeq_id> extra_id(new CSeq_id("AY123456.1"));
    entry->SetSeq().SetId().push_back(extra_id);

    // variation feature whose /compare accession has no ".version" suffix.
    CRef<CSeq_feat> variation = unit_test_util::AddMiscFeature(entry);
    variation->SetData().SetImp().SetKey("variation");
    CRef<CGb_qual> compare_qual(new CGb_qual("compare", "NC_000001"));
    variation->SetQual().push_back(compare_qual);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error,
        "InvalidCompareMissingVersion",
        "NC_000001 accession missing version for qualifier compare"));
    //AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
23652 
23653 
BOOST_AUTO_TEST_CASE(Test_InvalidCompareBadAccession)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Add a second ID built from "AY123456.1"; errors are expected to be
    // reported against it.
    CRef<CSeq_id> extra_id(new CSeq_id("AY123456.1"));
    entry->SetSeq().SetId().push_back(extra_id);

    // variation feature whose /compare value "x_y" is not an accession.
    CRef<CSeq_feat> variation = unit_test_util::AddMiscFeature(entry);
    variation->SetData().SetImp().SetKey("variation");
    CRef<CGb_qual> compare_qual(new CGb_qual("compare", "x_y"));
    variation->SetQual().push_back(compare_qual);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("gb|AY123456.1|", eDiag_Error,
        "InvalidCompareBadAccession",
        "x_y is not a legal accession for qualifier compare"));
    //AddChromosomeNoLocation(expected_errors, "gb|AY123456.1|");
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
23674 
23675 
BOOST_AUTO_TEST_CASE(Test_RegulatoryClassOtherNeedsNote)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // regulatory feature with /regulatory_class=other.
    CRef<CSeq_feat> regulatory = unit_test_util::AddMiscFeature(entry);
    regulatory->SetData().SetImp().SetKey("regulatory");
    regulatory->SetQual().push_back(
        CRef<CGb_qual>(new CGb_qual("regulatory_class", "other")));

    STANDARD_SETUP

    // First pass is clean -- presumably because the feature returned by
    // AddMiscFeature still carries a comment at this point.
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // With the comment removed, class 'other' has no accompanying note
    // and must be reported.
    regulatory->ResetComment();
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "RegulatoryClassOtherNeedsNote",
        "The regulatory_class 'other' is missing the required /note"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23701 
23702 
BOOST_AUTO_TEST_CASE(Test_UnparsedtRNAAnticodon)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // tRNA (amino acid 'A') that still carries a textual /anticodon
    // qualifier.
    CRef<CSeq_feat> trna_feat = unit_test_util::AddMiscFeature(entry);
    auto& rna_ref = trna_feat->SetData().SetRna();
    rna_ref.SetType(CRNA_ref::eType_tRNA);
    rna_ref.SetExt().SetTRNA().SetAa().SetNcbieaa('A');
    CRef<CGb_qual> anticodon_qual(new CGb_qual("anticodon", "other"));
    trna_feat->SetQual().push_back(anticodon_qual);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "UnparsedtRNAAnticodon",
        "Unparsed anticodon qualifier in tRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23722 
23723 
BOOST_AUTO_TEST_CASE(Test_UnparsedtRNAProduct)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // tRNA (amino acid 'A') that still carries a textual /product
    // qualifier.
    CRef<CSeq_feat> trna_feat = unit_test_util::AddMiscFeature(entry);
    auto& rna_ref = trna_feat->SetData().SetRna();
    rna_ref.SetType(CRNA_ref::eType_tRNA);
    rna_ref.SetExt().SetTRNA().SetAa().SetNcbieaa('A');
    CRef<CGb_qual> product_qual(new CGb_qual("product", "other"));
    trna_feat->SetQual().push_back(product_qual);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "UnparsedtRNAProduct",
        "Unparsed product qualifier in tRNA"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23743 
23744 
BOOST_AUTO_TEST_CASE(Test_rRNADoesNotHaveProduct)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // rRNA feature on which only the RNA type is set -- no name is given.
    CRef<CSeq_feat> nameless_rrna = unit_test_util::AddMiscFeature(entry);
    nameless_rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);

    STANDARD_SETUP

    // The missing name must be reported.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "rRNADoesNotHaveProduct",
        "rRNA has no name"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23762 
23763 
BOOST_AUTO_TEST_CASE(Test_MobileElementInvalidQualifier)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // repeat_region feature with /mobile_element=foo.
    CRef<CSeq_feat> repeat = unit_test_util::AddMiscFeature(entry);
    repeat->SetData().SetImp().SetKey("repeat_region");
    repeat->AddQualifier("mobile_element", "foo");

    STANDARD_SETUP

    // "foo" is rejected as a mobile_element value.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "MobileElementInvalidQualifier",
        "foo is not a legal value for qualifier mobile_element"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // "integron" validates without any report.
    repeat->SetQual().front()->SetVal("integron");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23788 
23789 
BOOST_AUTO_TEST_CASE(Test_InvalidReplace)
{
    // --- Nucleotide case -------------------------------------------------
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> diff_feat = unit_test_util::AddMiscFeature(entry);
    diff_feat->SetData().SetImp().SetKey("misc_difference");
    diff_feat->AddQualifier("replace", "123");

    STANDARD_SETUP

    // Digits are rejected as a /replace value on a nucleotide sequence.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "InvalidReplace",
        "123 is not a legal value for qualifier replace - should only be composed of acgtmrwsykvhdbn nucleotide bases"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // A string of plain bases validates without any report.
    diff_feat->SetQual().front()->SetVal("aaccttgg");
    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // --- Protein case ----------------------------------------------------
    // Swap the entry in the scope for a nuc-prot set and put the same bad
    // qualifier on a feature located on the protein.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_entry> prot_entry = unit_test_util::GetProteinSequenceFromGoodNucProtSet(entry);

    diff_feat = unit_test_util::AddMiscFeature(prot_entry, prot_entry->GetSeq().GetLength() - 1);
    diff_feat->SetData().SetImp().SetKey("misc_difference");
    diff_feat->AddQualifier("replace", "123");
    seh = scope.AddTopLevelSeqEntry(*entry);

    // On a protein the message lists amino-acid letters instead.
    expected_errors.push_back(new CExpectedError("lcl|prot", eDiag_Error,
        "InvalidReplace",
        "123 is not a legal value for qualifier replace - should only be composed of acdefghiklmnpqrstuvwy* amino acids"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
23832 
23833 
BOOST_AUTO_TEST_CASE(Test_InvalidVariationReplace)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // variation feature with /replace=123.
    CRef<CSeq_feat> variation = unit_test_util::AddMiscFeature(entry);
    variation->SetData().SetImp().SetKey("variation");
    variation->AddQualifier("replace", "123");

    STANDARD_SETUP

    // Digits are rejected as a /replace value on a variation feature.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
        "InvalidVariationReplace",
        "123 is not a legal value for qualifier replace - should only be composed of acgt unambiguous nucleotide bases"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // A string of a/c/g/t validates without any report.
    variation->SetQual().front()->SetVal("aaccttgg");
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23858 
23859 
BOOST_AUTO_TEST_CASE(Test_InvalidProductOnGene)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();

    // Gene feature (locus "x") that carries a /product qualifier.
    CRef<CSeq_feat> gene_feat = unit_test_util::AddMiscFeature(entry);
    gene_feat->SetData().SetGene().SetLocus("x");
    gene_feat->AddQualifier("product", "hypothetical protein");

    STANDARD_SETUP

    // Reported at Info severity.
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Info,
        "InvalidProductOnGene",
        "A product qualifier is not used on a gene feature"));
    //AddChromosomeNoLocation(expected_errors, entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
23878 
23879 
// Adds a codon_start qualifier with the value "z" to the coding region of a
// good nuc-prot set and expects the InvalidCodonStart warning.
BOOST_AUTO_TEST_CASE(Test_InvalidCodonStart)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodNucProtSet();
    CRef<CSeq_feat> cds = unit_test_util::GetCDSFromGoodNucProtSet(entry);

    CRef<CGb_qual> bad_codon_start(new CGb_qual("codon_start", "z"));
    cds->SetQual().push_back(bad_codon_start);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|nuc",
        eDiag_Warning,
        "InvalidCodonStart",
        "codon_start value should be 1, 2, or 3"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23895 
23896 
// Builds a delta sequence whose first and last segments are replaced by
// GenBank intervals (AY123456 and AY123457, positions 0-99), marks the parent
// source as apicoplast, and expects the
// InconsistentBioSources_ConLocation warning.
BOOST_FIXTURE_TEST_CASE(Test_InconsistentBioSources_ConLocation, CGenBankFixture)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodDeltaSeq();
    unit_test_util::SetGenome(entry, CBioSource::eGenome_apicoplast);

    auto& segments = entry->SetSeq().SetInst().SetExt().SetDelta().Set();

    // First segment -> interval on AY123456.
    CSeq_loc& first_loc = segments.front()->SetLoc();
    first_loc.SetInt().SetId().SetGenbank().SetAccession("AY123456");
    first_loc.SetInt().SetFrom(0);
    first_loc.SetInt().SetTo(99);

    // Last segment -> interval on AY123457.
    CSeq_loc& last_loc = segments.back()->SetLoc();
    last_loc.SetInt().SetId().SetGenbank().SetAccession("AY123457");
    last_loc.SetInt().SetFrom(0);
    last_loc.SetInt().SetTo(99);

    entry->SetSeq().SetInst().SetLength(210);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good",
        eDiag_Warning,
        "InconsistentBioSources_ConLocation",
        "Genome difference between parent and component"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
23922 
23923 
TestOverlappingRNAFeatures(const CSeq_loc & loc1,const CSeq_loc & loc2,bool expect_err)23924 void TestOverlappingRNAFeatures(const CSeq_loc& loc1, const CSeq_loc& loc2, bool expect_err)
23925 {
23926     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
23927     CRef<CSeq_feat> rrna = unit_test_util::AddMiscFeature(entry);
23928     rrna->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
23929     rrna->SetData().SetRna().SetExt().SetName("16S ribosomal RNA");
23930     rrna->SetLocation().Assign(loc1);
23931 
23932     CRef<CSeq_feat> trna = unit_test_util::AddMiscFeature(entry);
23933     trna->SetData().SetRna().SetType(CRNA_ref::eType_tRNA);
23934     trna->SetData().SetRna().SetExt().SetTRNA().SetAa().SetIupacaa('A');
23935     trna->SetLocation().Assign(loc2);
23936 
23937     STANDARD_SETUP
23938 
23939     if (expect_err) {
23940         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadRRNAcomponentOverlapTRNA",
23941                                  "tRNA-rRNA overlap"));
23942     }
23943     //AddChromosomeNoLocation(expected_errors, entry);
23944     eval = validator.Validate(seh, options);
23945     CheckErrors(*eval, expected_errors);
23946 
23947     CLEAR_ERRORS
23948 }
23949 
23950 
// Drives TestOverlappingRNAFeatures with a fixed rRNA interval (0-10) and a
// tRNA interval that is moved step by step: identical and 6-16 are expected
// to report the overlap, 7-17 and 11-17 are not.
BOOST_AUTO_TEST_CASE(Test_BADRRNAcomponentOverlapTRNA)
{
    CRef<CSeq_loc> rrna_loc(new CSeq_loc());
    rrna_loc->SetInt().SetId().SetLocal().SetStr("good");
    rrna_loc->SetInt().SetFrom(0);
    rrna_loc->SetInt().SetTo(10);

    // The tRNA location starts out as an exact copy of the rRNA location.
    CRef<CSeq_loc> trna_loc(new CSeq_loc());
    trna_loc->Assign(*rrna_loc);
    TestOverlappingRNAFeatures(*rrna_loc, *trna_loc, true);

    // Move the tRNA interval and re-run the check with the given expectation.
    auto check_moved = [&](int from, int to, bool expect_err) {
        trna_loc->SetInt().SetFrom(from);
        trna_loc->SetInt().SetTo(to);
        TestOverlappingRNAFeatures(*rrna_loc, *trna_loc, expect_err);
    };

    check_moved(6, 16, true);
    check_moved(7, 17, false);
    check_moved(11, 17, false);
}
23976 
23977 
// VR-796 / RW-991: a complete, circular, mitochondrial "Metazoan" sequence
// with a declared length of 66000 is expected to report MitoMetazoanTooLong;
// with a declared length of 64000 only the SeqDataLenWrong error is expected.
// (SeqDataLenWrong appears in both passes because only the declared length is
// changed, not the 60 residues of sequence data.)
BOOST_AUTO_TEST_CASE(Test_VR_796)
{
    const string kTooLongMsg = "Mitochondrial Metazoan sequences should be less than 65000 bp";

    CRef<CSeq_entry> entry = BuildGoodSeq();
    SetLineage(entry, "Metazoan");
    SetGenome(entry, CBioSource::eGenome_mitochondrion);
    entry->SetSeq().SetInst().SetTopology(CSeq_inst::eTopology_circular);
    entry->SetSeq().SetInst().SetLength(66000);
    SetCompleteness(entry, CMolInfo::eCompleteness_complete);

    STANDARD_SETUP

    // Pass 1: declared length 66000.
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "MitoMetazoanTooLong", kTooLongMsg));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "SeqDataLenWrong",
        "Bioseq.seq_data too short [60] for given length [66000]"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Pass 2 (RW-991): declared length 64000. The entry is taken out of the
    // scope before it is edited and re-added afterwards.
    scope.RemoveTopLevelSeqEntry(seh);
    entry->SetSeq().SetInst().SetLength(64000);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "SeqDataLenWrong",
        "Bioseq.seq_data too short [60] for given length [64000]"));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24011 
24012 
24013 
MakeSmallGenomeSetNucId(size_t num)24014 CRef<CSeq_id> MakeSmallGenomeSetNucId(size_t num)
24015 {
24016     CRef<CSeq_id> n1(new CSeq_id());
24017     n1->SetLocal().SetStr("nuc_" + NStr::NumericToString(num + 1));
24018     return n1;
24019 }
24020 
24021 
AddGeneticCode(CSeq_feat & cds,CGenetic_code::C_E::TId code_id)24022 void AddGeneticCode(CSeq_feat& cds, CGenetic_code::C_E::TId code_id)
24023 {
24024     CRef< CGenetic_code::C_E > ce(new CGenetic_code::C_E);
24025     ce->SetId(code_id);
24026     CRef<CGenetic_code> code(new CGenetic_code());
24027     code->Set().push_back(ce);
24028     cds.SetData().SetCdregion().SetCode(*code);
24029 }
24030 
24031 
BuildSmallGenomeSet(size_t num_np)24032 CRef<CSeq_entry> BuildSmallGenomeSet(size_t num_np)
24033 {
24034     CRef<CSeq_entry> entry(new CSeq_entry());
24035     entry->SetSet().SetClass(CBioseq_set::eClass_small_genome_set);
24036 
24037     for (size_t i = 0; i < num_np; i++) {
24038         CRef<CSeq_entry> np1 = BuildGoodNucProtSet();
24039         CRef<CSeq_feat> cds = GetCDSFromGoodNucProtSet(np1);
24040         AddGeneticCode(*cds, 11);
24041         unit_test_util::SetGenome(np1, CBioSource::eGenome_chloroplast);
24042         CRef<CSeq_id> n1 = MakeSmallGenomeSetNucId(i);
24043         unit_test_util::ChangeNucProtSetNucId(np1, n1);
24044         CRef<CSeq_id> p1(new CSeq_id());
24045         p1->SetLocal().SetStr("prot_" + NStr::NumericToString(i + 1));
24046         ChangeNucProtSetProteinId(np1, p1);
24047         entry->SetSet().SetSeq_set().push_back(np1);
24048     }
24049     return entry;
24050 }
24051 
24052 
AddCdregionToSmallGenomeSet(CRef<CSeq_entry> entry,size_t cdr1_num,size_t cdr2_num,size_t cdr_pos,size_t p_pos)24053 void AddCdregionToSmallGenomeSet(CRef<CSeq_entry> entry, size_t cdr1_num, size_t cdr2_num, size_t cdr_pos, size_t p_pos)
24054 {
24055     CRef<CSeq_feat> cdregion(new CSeq_feat());
24056     AddGeneticCode(*cdregion, 11);
24057     CRef<CSeq_loc> loc1(new CSeq_loc());
24058     loc1->SetInt().SetFrom(0);
24059     loc1->SetInt().SetTo(10);
24060     CRef<CSeq_id> n1 = MakeSmallGenomeSetNucId(cdr1_num);
24061     loc1->SetInt().SetId().Assign(*n1);
24062     CRef<CSeq_loc> loc2(new CSeq_loc());
24063     loc2->SetInt().SetFrom(11);
24064     loc2->SetInt().SetTo(26);
24065     CRef<CSeq_id> n2 = MakeSmallGenomeSetNucId(cdr2_num);
24066     loc2->SetInt().SetId().Assign(*n2);
24067 
24068     cdregion->SetLocation().SetMix().Set().push_back(loc1);
24069     cdregion->SetLocation().SetMix().Set().push_back(loc2);
24070 
24071     CRef<CSeq_entry> prot = unit_test_util::MakeProteinForGoodNucProtSet("special_prot");
24072     cdregion->SetProduct().SetWhole().Assign(*(prot->GetSeq().GetId().front()));
24073 
24074     auto it = entry->SetSet().SetSeq_set().begin();
24075     size_t offset = 1;
24076     while (it != entry->SetSet().SetSeq_set().end()) {
24077         if (offset == cdr_pos) {
24078             (*it)->SetSet().SetAnnot().front()->SetData().SetFtable().push_back(cdregion);
24079         }
24080         if (offset == p_pos) {
24081             (*it)->SetSet().SetSeq_set().push_back(prot);
24082         }
24083         it++;
24084         offset++;
24085     }
24086 }
24087 
24088 
24089 // If we have a small genome set, then a feature could legitimately
24090 // have a location with intervals on multiple sequences.
24091 // This should not trigger the CDSproductPackagingProblem error as long
24092 // as the protein sequence is packaged in the same nuc-prot set as one
24093 // of the nucleotide sequences that the coding region is located on
// GB-7601: validates a three-member small-genome set four times - without a
// trans-spliced coding region, with the coding region and protein both in
// member 1, both in member 2, and finally with the coding region in member 2
// but the protein in member 3. Only the last arrangement is expected to
// report CDSproductPackagingProblem.
BOOST_AUTO_TEST_CASE(Test_GB_7601)
{
    CRef<CSeq_entry> entry = BuildSmallGenomeSet(3);

    STANDARD_SETUP

    // Baseline: nothing added, nothing expected.
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Arrangement 1: feature and protein both go into member 1 of the
    // existing set; still no expected errors.
    scope.RemoveTopLevelSeqEntry(seh);
    AddCdregionToSmallGenomeSet(entry, 0, 1, 1, 1);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Arrangement 2: fresh set, feature and protein both in member 2;
    // still no expected errors.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = BuildSmallGenomeSet(3);
    AddCdregionToSmallGenomeSet(entry, 0, 1, 2, 2);
    seh = scope.AddTopLevelSeqEntry(*entry);
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // Arrangement 3: fresh set, feature in member 2 but protein in member 3,
    // which is not one of the sequences the coding region is located on.
    scope.RemoveTopLevelSeqEntry(seh);
    entry = BuildSmallGenomeSet(3);
    AddCdregionToSmallGenomeSet(entry, 0, 1, 2, 3);
    seh = scope.AddTopLevelSeqEntry(*entry);

    expected_errors.push_back(new CExpectedError(
        "",
        eDiag_Warning,
        "CDSproductPackagingProblem",
        "Protein product not packaged in nuc-prot set with nucleotide in small genome set"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24137 
24138 
// A sequence that has the BARCODE keyword (with barcode tech) together with
// an Unverified user object is expected to report BadKeywordUnverified.
BOOST_AUTO_TEST_CASE(Test_BadKeywordUnverified)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();
    AddGenbankKeyword(entry, "BARCODE");
    SetTech(entry, CMolInfo::eTech_barcode);

    // Descriptor holding the Unverified user object.
    CRef<CSeqdesc> unverified(new CSeqdesc());
    unverified->SetUser().SetObjectType(CUser_object::eObjectType_Unverified);
    entry->SetSeq().SetDescr().Set().push_back(unverified);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good",
        eDiag_Warning,
        "BadKeywordUnverified",
        "Sequence has both BARCODE and UNVERIFIED keywords"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24160 
24161 
// Builds a sequence with taxname "BOLD bacterium sp. zz" and an iBOL
// structured comment carrying a "Barcode Index Number" field. The BIN
// mismatch error itself was removed per VR-843, so the only expected message
// is OrganismNotFound.
BOOST_AUTO_TEST_CASE(Test_BINDoesNotMatch)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();
    SetTaxname(entry, "BOLD bacterium sp. zz");

    // Structured comment copied into a new user descriptor.
    CRef<CUser_object> ibol_comment = edit::CStructuredCommentField::MakeUserObject("International Barcode of Life (iBOL)Data");
    CRef<CSeqdesc> comment_desc(new CSeqdesc());
    comment_desc->SetUser().Assign(*ibol_comment);

    // Barcode Index Number field with a placeholder value.
    CRef<CUser_field> bin_field(new CUser_field());
    bin_field->SetLabel().SetStr("Barcode Index Number");
    bin_field->SetData().SetStr("xxx");
    comment_desc->SetUser().SetData().push_back(bin_field);

    entry->SetSeq().SetDescr().Set().push_back(comment_desc);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good",
        eDiag_Warning,
        "OrganismNotFound",
        "Organism not found in taxonomy database"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24188 
24189 
AddStrsField(CUser_object & user,const string & label,const string & val)24190 void AddStrsField(CUser_object& user, const string& label, const string& val)
24191 {
24192     CRef<CUser_field> uf(new CUser_field());
24193     uf->SetLabel().SetStr(label);
24194     uf->SetData().SetStrs().push_back(val);
24195     user.SetData().push_back(uf);
24196 }
24197 
24198 
// Attaches three DBLink user objects to one sequence - one with several
// malformed entries, one with only an assembly, one left empty - and checks
// the full list of DBLink-related messages.
BOOST_AUTO_TEST_CASE(Test_BadDBLink)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();

    // DBLink #1: assembly, bioproject, a mis-capitalized field label and an
    // unrecognized field label.
    CRef<CSeqdesc> db1(new CSeqdesc());
    CUser_object& link1 = db1->SetUser();
    link1.SetObjectType(CUser_object::eObjectType_DBLink);
    edit::CDBLink::SetAssembly(link1, "ZZZ");
    edit::CDBLink::SetBioProject(link1, "XXX");
    AddStrsField(link1, "Sequence read archive", "AAA");  // bad capitalization
    AddStrsField(link1, "unknown", "BBB");                // unknown field
    entry->SetSeq().SetDescr().Set().push_back(db1);

    // DBLink #2: a second assembly entry.
    CRef<CSeqdesc> db2(new CSeqdesc());
    CUser_object& link2 = db2->SetUser();
    link2.SetObjectType(CUser_object::eObjectType_DBLink);
    edit::CDBLink::SetAssembly(link2, "YYY");
    entry->SetSeq().SetDescr().Set().push_back(db2);

    // DBLink #3: typed as DBLink but given no data.
    CRef<CSeqdesc> db3(new CSeqdesc());
    db3->SetUser().SetObjectType(CUser_object::eObjectType_DBLink);
    entry->SetSeq().SetDescr().Set().push_back(db3);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "MultipleDBLinkObjects",
        "3 DBLink user objects apply to a Bioseq"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "DBLinkBadAssembly",
        "Assembly entries appear in 2 DBLink user objects"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "DBLinkBadFormat",
        "Unrecognized entries appear in 1 DBLink user object"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "DBLinkBadBioProject",
        "Bad BioProject format - XXX"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "DBLinkBadSRAaccession",
        "Bad Sequence Read Archive format - AAA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Critical, "DBLinkBadCapitalization",
        "Bad DBLink capitalization - Sequence read archive"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "DBLinkMissingUserObject",
        "DBLink user object descriptor is empty"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "UserObjectNoData",
        "User object with no data"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24250 
24251 
// Puts a DBLink user object (BioSample SAMN1234) on the descriptors of an
// eco set and expects the DBLinkOnSet error.
BOOST_AUTO_TEST_CASE(Test_DBLinkOnSet)
{
    CRef<CSeq_entry> entry = BuildGoodEcoSet();

    CRef<CSeqdesc> set_link(new CSeqdesc());
    set_link->SetUser().SetObjectType(CUser_object::eObjectType_DBLink);
    edit::CDBLink::SetBioSample(set_link->SetUser(), "SAMN1234");
    entry->SetSet().SetDescr().Set().push_back(set_link);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good1",
        eDiag_Error,
        "DBLinkOnSet",
        "DBLink user object should not be on this set"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24271 
24272 
// Adds an assembly_gap import feature (positions 12-21, estimated_length 10,
// gap_type fragment) to a delta sequence and expects the
// AssemblyGapFeatureProblem warning.
BOOST_AUTO_TEST_CASE(Test_AssemblyGapFeatureProblem)
{
    CRef<CSeq_entry> entry = BuildGoodDeltaSeq();

    CRef<CSeq_feat> gap_feat = AddMiscFeature(entry);
    gap_feat->SetData().SetImp().SetKey("assembly_gap");
    gap_feat->SetLocation().SetInt().SetFrom(12);
    gap_feat->SetLocation().SetInt().SetTo(21);

    CRef<CGb_qual> length_qual(new CGb_qual("estimated_length", "10"));
    gap_feat->SetQual().push_back(length_qual);
    CRef<CGb_qual> type_qual(new CGb_qual("gap_type", "fragment"));
    gap_feat->SetQual().push_back(type_qual);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good",
        eDiag_Warning,
        "AssemblyGapFeatureProblem",
        "An assembly_gap feature should only be on a contig record"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24293 
24294 
MakeLeft(CSeq_loc & loc)24295 void MakeLeft(CSeq_loc& loc)
24296 {
24297     loc.SetInt().SetFrom(0);
24298     loc.SetInt().SetTo(5);
24299 }
24300 
MakeRight(CSeq_loc & loc,TSeqPos stop)24301 void MakeRight(CSeq_loc& loc, TSeqPos stop)
24302 {
24303     loc.SetInt().SetFrom(stop - 6);
24304     loc.SetInt().SetTo(stop - 1);
24305 }
24306 
TestUTRPair(bool add_gene,bool is_minus)24307 void TestUTRPair(bool add_gene, bool is_minus)
24308 {
24309     CRef<CSeq_entry> entry = BuildGoodSeq();
24310     TSeqPos stop = entry->GetSeq().GetLength() - 1;
24311     if (add_gene) {
24312         CRef<CSeq_feat> gene = AddMiscFeature(entry);
24313         gene->ResetComment();
24314         gene->SetData().SetGene().SetLocus("x");
24315         gene->SetLocation().SetInt().SetTo(stop);
24316         if (is_minus) {
24317             gene->SetLocation().SetInt().SetStrand(eNa_strand_minus);
24318         }
24319     }
24320 
24321     CRef<CSeq_feat> utr5 = AddMiscFeature(entry);
24322     utr5->ResetComment();
24323     utr5->SetData().SetImp().SetKey("5'UTR");
24324     if (is_minus) {
24325         MakeRight(utr5->SetLocation(), stop);
24326         utr5->SetLocation().SetInt().SetStrand(eNa_strand_minus);
24327     } else {
24328         MakeLeft(utr5->SetLocation());
24329     }
24330 
24331     CRef<CSeq_feat> utr3 = AddMiscFeature(entry);
24332     utr3->ResetComment();
24333     utr3->SetData().SetImp().SetKey("3'UTR");
24334     if (is_minus) {
24335         MakeLeft(utr3->SetLocation());
24336         utr3->SetLocation().SetInt().SetStrand(eNa_strand_minus);
24337     } else {
24338         MakeRight(utr3->SetLocation(), stop);
24339     }
24340 
24341     STANDARD_SETUP
24342 
24343     if (add_gene) {
24344         if (is_minus) {
24345             expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24346                     "NoCDSbetweenUTRs", "CDS not between 5'UTR and 3'UTR on minus strand"));
24347         } else {
24348             expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24349                     "NoCDSbetweenUTRs", "CDS not between 5'UTR and 3'UTR on plus strand"));
24350         }
24351     }
24352     //AddChromosomeNoLocation(expected_errors, entry);
24353 
24354     eval = validator.Validate(seh, options);
24355     CheckErrors(*eval, expected_errors);
24356 
24357     CLEAR_ERRORS
24358 }
24359 
// Runs TestUTRPair for every combination of (add_gene, is_minus), in the
// order (false,false), (false,true), (true,false), (true,true).
BOOST_AUTO_TEST_CASE(Test_NoCDSbetweenUTRs)
{
    const bool kChoices[] = { false, true };

    for (bool add_gene : kChoices) {
        for (bool is_minus : kChoices) {
            TestUTRPair(add_gene, is_minus);
        }
    }
}
24367 
// Sets the nat_host modifier to "Gromphadorina portentosa", expects the
// BadSpecificHost warning that names the preferred spelling, and then checks
// how the last reported error is rendered by
// CValidErrorFormat::FormatForSubmitterReport.
BOOST_AUTO_TEST_CASE(Test_FormatBadSpecificHostAlternateName)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();
    SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Gromphadorina portentosa");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadSpecificHost",
            "Specific host value is alternate name: Gromphadorina portentosa should be Gromphadorhina portentosa"));
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    // CheckErrors only records a failure when the error list does not match,
    // so execution continues even if the validator reported nothing. Calling
    // back() on an empty error list would be undefined behaviour; report a
    // check failure instead and skip the formatting step in that case.
    const bool has_errors = !eval->GetErrs().empty();
    BOOST_CHECK_MESSAGE(has_errors,
        "validator reported no errors; cannot check submitter report format");
    if (has_errors) {
        CValidErrorFormat format(*objmgr);
        string val = format.FormatForSubmitterReport(*(eval->GetErrs().back()), scope);
        BOOST_CHECK_EQUAL(val, "lcl|good\tGromphadorina portentosa should be Gromphadorhina portentosa");
    }

    CLEAR_ERRORS
}
24388 
// VR-803: an rRNA feature ("23S ribosomal RNA") whose product is the far
// sequence gi 507148189. Expects the transcript length and mismatch warnings
// plus the RnaProductMismatch error.
BOOST_FIXTURE_TEST_CASE(Test_VR_803, CGenBankFixture)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();

    CRef<CSeq_feat> rrna_feat = AddMiscFeature(entry);
    rrna_feat->SetData().SetRna().SetType(CRNA_ref::eType_rRNA);
    rrna_feat->SetData().SetRna().SetExt().SetName("23S ribosomal RNA");
    rrna_feat->SetProduct().SetWhole().SetGi(GI_CONST(507148189));

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "TranscriptLen",
        "Transcript length [11] less than (far) product length [3132], and tail < 95% polyA"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Warning, "TranscriptMismatches",
        "There are 7 mismatches out of 11 bases between the transcript and (far) product sequence"));
    expected_errors.push_back(new CExpectedError(
        "lcl|good", eDiag_Error, "RnaProductMismatch",
        "Type of RNA does not match MolInfo of product Bioseq"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24411 
24412 
// A gene with the exception text "gene split at contig boundary" and a locus
// but no locus_tag is expected to report ExceptionRequiresLocusTag.
BOOST_AUTO_TEST_CASE(Test_ExceptionRequiresLocusTag)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();

    CRef<CSeq_feat> split_gene = AddMiscFeature(entry);
    split_gene->SetData().SetGene().SetLocus("x");
    split_gene->SetExcept(true);
    split_gene->SetExcept_text("gene split at contig boundary");

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good",
        eDiag_Warning,
        "ExceptionRequiresLocusTag",
        "Gene has split exception but no locus_tag"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24431 
24432 
MakeGeneious()24433 CRef<CSeq_submit> MakeGeneious()
24434 {
24435     CRef<CSeq_submit> ss(new CSeq_submit());
24436     ss->SetSub().SetTool("Geneious");
24437     CRef<CAuthor> author = unit_test_util::BuildGoodAuthor();
24438     ss->SetSub().SetCit().SetAuthors().SetNames().SetStd().push_back(author);
24439     ss->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetAffil("some affiliation");
24440     ss->SetSub().SetCit().SetAuthors().SetAffil().SetStd().SetCountry("Russia");
24441 
24442     ss->SetSub().SetCit().SetDate().SetStd().SetYear(2009);
24443     ss->SetSub().SetCit().SetDate().SetStd().SetMonth(12);
24444     ss->SetSub().SetCit().SetDate().SetStd().SetDay(31);
24445 
24446     return ss;
24447 }
24448 
24449 
// Validates a "Geneious" Seq-submit that contains a feature with a
// mixed-strand two-part location and expects the MixedStrand warning.
// Validation runs on the Seq-submit itself, with far fetching of CDS
// products switched on.
BOOST_AUTO_TEST_CASE(Test_Geneious)
{
    CRef<CSeq_submit> submit = MakeGeneious();

    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    CRef<CSeq_feat> feat = AddMiscFeature(entry);

    // Two-part mix location: first part 0-0 on the minus strand, last part
    // 9-10 with no strand set here.
    CRef<CSeq_loc> mixed_loc = unit_test_util::MakeMixLoc(entry->SetSeq().SetId().front());
    auto& parts = mixed_loc->SetMix().Set();
    parts.front()->SetInt().SetFrom(0);
    parts.front()->SetInt().SetTo(0);
    parts.front()->SetInt().SetStrand(eNa_strand_minus);
    parts.back()->SetInt().SetFrom(9);
    parts.back()->SetInt().SetTo(10);
    feat->SetLocation().Assign(*mixed_loc);

    submit->SetData().SetEntrys().push_back(entry);

    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError(
        "lcl|good",
        eDiag_Warning,
        "MixedStrand",
        "Location: Mixed strands in SeqLoc [(lcl|good:c1-1, 10-11)]"));

    options |= CValidator::eVal_far_fetch_cds_products;
    eval = validator.Validate(*submit, &scope, options);

    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24480 
24481 
24482 // From VR-793
24483 // A.	For segment, endogenous_virus_name:
24484 // 1. Must begin with a letter or number
24485 // 2. Spaces and other printable characters are permitted
24486 // 3. Must not be empty, must not be longer than 240 characters
24487 // B.	For chromosome, linkage_group and plasmid_name values:
24488 // 4.	Must begin with a letter or number
24489 // 5.	Must not be empty (not currently true), must not be longer than 32 characters
24490 // 6.	Must not contain <tab>
24491 // 7.	Spaces and other printable characters are permitted
24492 // 8.	Must not contain the word "plasmid" (ignoring case)
24493 // 9.	Must not contain the word "chromosome" (ignoring case)
24494 // 10.	Must not contain the phrase "linkage group" (ignoring case)
24495 // 11.	Must not contain the series of letters "chr" (ignoring case)
24496 // 12.	Must not contain the taxname (ignoring case)
24497 // 14.  Must not contain the genus (ignoring case)
24498 // 15. Must not contain the species (ignoring case)
24499 // 16. Must not contain the series of letters "chrm" (ignoring case)
24500 // 17. Must not contain the series of letters "chrom" (ignoring case)
24501 // 18. Must not contain the phrase "linkage-group" (ignoring case)
24502 // C.	For plasmid_name values:
24503 // 19. Exception- megaplasmid is legal
24504 // D.	plastid_name is obsolete so no value is legal.
24505 // 20. digits or numerals: Plastid name subsource contains unrecognized value
24506 // 21. organelle: Plastid name subsource chloroplast but not chloroplast location
24507 
TestOneReplicon(CSubSource::ESubtype subtype,const string & val,const string & err_code,EDiagSev sev,const string & msg)24508 void TestOneReplicon(CSubSource::ESubtype subtype, const string& val, const string& err_code, EDiagSev sev, const string& msg)
24509 {
24510     CRef<CSeq_entry> entry = BuildGoodSeq();
24511     for (auto it : entry->SetSeq().SetDescr().Set()) {
24512         if (it->IsSource()) {
24513             bool found = false;
24514             for (auto sit : it->SetSource().SetSubtype()) {
24515                 if (sit->GetSubtype() == subtype) {
24516                     sit->SetName(val);
24517                     found = true;
24518                     break;
24519                 }
24520             }
24521             if (!found) {
24522                 CRef<CSubSource> ss(new CSubSource(subtype, val));
24523                 it->SetSource().SetSubtype().push_back(ss);
24524             }
24525             if (subtype == CSubSource::eSubtype_plasmid_name) {
24526                 it->SetSource().SetGenome(CBioSource::eGenome_plasmid);
24527             }
24528         }
24529     }
24530 
24531     STANDARD_SETUP
24532 
24533     if (!NStr::IsBlank(err_code)) {
24534         expected_errors.push_back(new CExpectedError("lcl|good", sev, err_code, msg));
24535     }
24536 
24537     if (subtype == CSubSource::eSubtype_segment) {
24538         expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "NonViralSegment",
24539                 "Non-viral source feature should not have a segment qualifier"));
24540     }
24541 
24542     eval = validator.Validate(seh, options);
24543 
24544     CheckErrors (*eval, expected_errors);
24545 
24546     CLEAR_ERRORS
24547 }
24548 
24549 
TestAlwaysBadReplicon(const string & val)24550 void TestAlwaysBadReplicon(const string& val)
24551 {
24552     TestOneReplicon(CSubSource::eSubtype_chromosome, val, "BadPlasmidChromosomeLinkageName",
24553         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + val + "'");
24554     TestOneReplicon(CSubSource::eSubtype_linkage_group, val, "BadPlasmidChromosomeLinkageName",
24555         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + val + "'");
24556     TestOneReplicon(CSubSource::eSubtype_plasmid_name, val, "BadPlasmidChromosomeLinkageName",
24557         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + val + "'");
24558     TestOneReplicon(CSubSource::eSubtype_segment, val, "BadTextInSourceQualifier", eDiag_Error, "segment value should start with letter or number");
24559     TestOneReplicon(CSubSource::eSubtype_endogenous_virus_name, val, "BadTextInSourceQualifier", eDiag_Error, "endogenous-virus-name value should start with letter or number");
24560 }
24561 
24562 
TestAlwaysGoodReplicon(const string & val)24563 void TestAlwaysGoodReplicon(const string& val)
24564 {
24565     TestOneReplicon(CSubSource::eSubtype_chromosome, val, "", eDiag_Info, "");
24566     TestOneReplicon(CSubSource::eSubtype_linkage_group, val, "", eDiag_Info, "");
24567     TestOneReplicon(CSubSource::eSubtype_plasmid_name, val, "", eDiag_Info, "");
24568     TestOneReplicon(CSubSource::eSubtype_segment, val, "", eDiag_Info, "");
24569     TestOneReplicon(CSubSource::eSubtype_endogenous_virus_name, val, "", eDiag_Info, "");
24570 }
24571 
24572 
TestRepliconTaxname(CSubSource::ESubtype subtype,bool expect_errs)24573 void TestRepliconTaxname(CSubSource::ESubtype subtype, bool expect_errs)
24574 {
24575     TestOneReplicon(subtype, "Sebaea microphylla",
24576         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24577         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'Sebaea microphylla'");
24578 
24579     TestOneReplicon(subtype, "Sebaea",
24580         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24581         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'Sebaea'");
24582 
24583     TestOneReplicon(subtype, "microphylla",
24584         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24585         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'microphylla'");
24586 
24587 }
24588 
TestRepliconForbiddenWords(CSubSource::ESubtype subtype,bool expect_errs)24589 void TestRepliconForbiddenWords(CSubSource::ESubtype subtype, bool expect_errs)
24590 {
24591     TestOneReplicon(subtype, "some CHROMOSOME",
24592         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24593         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'some CHROMOSOME'");
24594 
24595     TestOneReplicon(subtype, "linkage group x",
24596         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24597         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'linkage group x'");
24598 
24599     TestOneReplicon(subtype, "linkage-group x",
24600         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24601         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'linkage-group x'");
24602 
24603     TestOneReplicon(subtype, "linkage_group x",
24604         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24605         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'linkage_group x'");
24606 
24607     TestOneReplicon(subtype, "chry",
24608         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24609         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'chry'");
24610 
24611     TestOneReplicon(subtype, "chrm",
24612         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24613         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'chrm'");
24614 
24615     TestOneReplicon(subtype, "CHROM",
24616         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24617         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'CHROM'");
24618 
24619     TestOneReplicon(subtype, "PLASMID",
24620         expect_errs ? "BadPlasmidChromosomeLinkageName" : "",
24621         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'PLASMID'");
24622 
24623 }
24624 
24625 
BOOST_AUTO_TEST_CASE(Test_VR_793)24626 BOOST_AUTO_TEST_CASE(Test_VR_793)
24627 {
24628     // blanks are always bad
24629     TestAlwaysBadReplicon("");
24630     TestOneReplicon(CSubSource::eSubtype_plastid_name, "", "BadPlastidName", eDiag_Warning, "Plastid name subsource contains unrecognized value");
24631     TestOneReplicon(CSubSource::eSubtype_transposon_name, "", "ObsoleteSourceQual", eDiag_Warning, "Transposon name and insertion sequence name are no longer legal qualifiers");
24632     TestOneReplicon(CSubSource::eSubtype_insertion_seq_name, "", "ObsoleteSourceQual", eDiag_Warning, "Transposon name and insertion sequence name are no longer legal qualifiers");
24633 
24634     // must start with letter or number
24635     TestAlwaysBadReplicon(".2");
24636 
24637     // unprintable characters bad
24638     TestAlwaysBadReplicon("a\tb");
24639 
24640     // just letters ok
24641     TestAlwaysGoodReplicon("x");
24642 
24643     // spaces ok
24644     TestAlwaysGoodReplicon("x y");
24645 
24646     const string kMoreThan240 = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z";
24647     const string kMoreThan32 = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z";
24648 
24649     // segment
24650     TestOneReplicon(CSubSource::eSubtype_segment, kMoreThan32, "", eDiag_Info, "");
24651     TestOneReplicon(CSubSource::eSubtype_segment, kMoreThan240, "BadTextInSourceQualifier", eDiag_Error, "segment value should start with letter or number");
24652     TestRepliconTaxname(CSubSource::eSubtype_segment, false);
24653     TestRepliconForbiddenWords(CSubSource::eSubtype_segment, false);
24654 
24655     // endogenous virus name
24656     TestOneReplicon(CSubSource::eSubtype_endogenous_virus_name, kMoreThan32, "", eDiag_Info, "");
24657     TestOneReplicon(CSubSource::eSubtype_endogenous_virus_name, kMoreThan240, "BadTextInSourceQualifier", eDiag_Error, "endogenous-virus-name value should start with letter or number");
24658     TestRepliconTaxname(CSubSource::eSubtype_endogenous_virus_name, false);
24659     TestRepliconForbiddenWords(CSubSource::eSubtype_endogenous_virus_name, false);
24660 
24661     // chromosome
24662     TestOneReplicon(CSubSource::eSubtype_chromosome, kMoreThan32, "BadPlasmidChromosomeLinkageName",
24663         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan32 + "'");
24664     TestOneReplicon(CSubSource::eSubtype_chromosome, kMoreThan240, "BadPlasmidChromosomeLinkageName",
24665         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan240 + "'");
24666     TestOneReplicon(CSubSource::eSubtype_chromosome, "LG 123", "BadPlasmidChromosomeLinkageName",
24667         eDiag_Error, "Problematic plasmid/chromosome/linkage group name 'LG 123'");
24668     TestRepliconTaxname(CSubSource::eSubtype_chromosome, true);
24669     TestRepliconForbiddenWords(CSubSource::eSubtype_chromosome, true);
24670 
24671     // linkage-group
24672     TestOneReplicon(CSubSource::eSubtype_linkage_group, kMoreThan32, "BadPlasmidChromosomeLinkageName",
24673         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan32 + "'");
24674     TestOneReplicon(CSubSource::eSubtype_linkage_group, kMoreThan240, "BadPlasmidChromosomeLinkageName",
24675         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan240 + "'");
24676     TestRepliconTaxname(CSubSource::eSubtype_linkage_group, true);
24677     TestRepliconForbiddenWords(CSubSource::eSubtype_linkage_group, true);
24678 
24679     // plasmid-name
24680     TestOneReplicon(CSubSource::eSubtype_plasmid_name, "megaplasmid", "", eDiag_Info, "");
24681     TestOneReplicon(CSubSource::eSubtype_plasmid_name, kMoreThan32, "BadPlasmidChromosomeLinkageName",
24682         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan32 + "'");
24683     TestOneReplicon(CSubSource::eSubtype_plasmid_name, kMoreThan240, "BadPlasmidChromosomeLinkageName",
24684         eDiag_Error, "Problematic plasmid/chromosome/linkage group name '" + kMoreThan240 + "'");
24685     TestRepliconTaxname(CSubSource::eSubtype_plasmid_name, true);
24686     TestRepliconForbiddenWords(CSubSource::eSubtype_plasmid_name, true);
24687 
24688 //    TestOneReplicon(CSubSource::eSubtype_plasmid_name, "pCHRO.01", "", eDiag_Info, "");
24689 
24690 }
24691 
24692 
CheckHost(const CBioseq & seq,const string & host)24693 void CheckHost(const CBioseq& seq, const string& host)
24694 {
24695     bool found_host = false;
24696     BOOST_CHECK_EQUAL(seq.IsSetDescr(), true);
24697     if (!seq.IsSetDescr()) {
24698         return;
24699     }
24700     for (auto d : seq.GetDescr().Get()) {
24701         if (d->IsSource() && d->GetSource().IsSetOrgMod()) {
24702             for (auto om : d->GetSource().GetOrg().GetOrgname().GetMod()) {
24703                 if (om->IsSetSubtype() && om->GetSubtype() == COrgMod::eSubtype_nat_host) {
24704                     BOOST_CHECK_EQUAL(host, om->IsSetSubname() ? om->GetSubname() : kEmptyStr);
24705                     found_host = true;
24706                 }
24707             }
24708         }
24709     }
24710     BOOST_CHECK_EQUAL(found_host, true);
24711 }
24712 
24713 
CheckOneSpecificHost(const string & orig,const string & newval)24714 void CheckOneSpecificHost(const string& orig, const string& newval)
24715 {
24716     CRef<CSeq_entry> entry = BuildGoodSeq();
24717     SetOrgMod(entry, COrgMod::eSubtype_nat_host, orig);
24718     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
24719     CScope scope(*objmgr);
24720     scope.AddDefaults();
24721     CSeq_entry_Handle seh = scope.AddTopLevelSeqEntry(*entry);
24722 
24723     validator::CTaxValidationAndCleanup tval;
24724 
24725     BOOST_CHECK_EQUAL(tval.DoTaxonomyUpdate(seh, true), true);
24726     CheckHost(entry->GetSeq(), newval);
24727 }
24728 
24729 
// VR-812: host values before and after the taxonomy update performed by
// CheckOneSpecificHost.
BOOST_AUTO_TEST_CASE(Test_VR_812)
{
    static const struct {
        const char* original;   // host value placed on the entry
        const char* expected;   // host value expected after the update
    } kHostCases[] = {
        // plain name: expected to be rewritten
        { "Canis familiaris", "Canis lupus familiaris" },
        // name followed by extra text: expected to stay as-is
        { "Canis familiaris; some other information", "Canis familiaris; some other information" },
        { "Hordeum spontaneum cultivar test", "Hordeum spontaneum cultivar test" }
    };

    for (const auto& host_case : kHostCases) {
        CheckOneSpecificHost(host_case.original, host_case.expected);
    }
}
24736 
24737 
// BIOS-1527: FixSpecificHost is expected to map each of these two host names
// to a different name (expected value on the left, call on the right).
// NOTE(review): presumably the right-hand names are outdated synonyms resolved
// through the taxonomy lookup -- confirm against FixSpecificHost.
BOOST_AUTO_TEST_CASE(Test_BIOS_1527)
{
    BOOST_CHECK_EQUAL("Acropora valida", FixSpecificHost("Acropora tumida"));
    BOOST_CHECK_EQUAL("Rhaponticum repens", FixSpecificHost("Acroptilon repens"));
}
24743 
24744 
// VR-814: an entry with lineage "Viroids;" and a tissue-type SubSource is
// expected to produce exactly one error, InvalidTissueType.
BOOST_AUTO_TEST_CASE(Test_VR_814)
{
    CRef<CSeq_entry> entry = BuildGoodSeq();
    SetLineage(entry, "Viroids;");
    SetSubSource(entry, CSubSource::eSubtype_tissue_type, "X");

    // Macro defined elsewhere in the test sources; the code below relies on it
    // for seh, options, validator, eval and expected_errors.
    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "InvalidTissueType",
        "Viroid has unexpected tissue-type qualifier"));
    // Disabled expectation, kept for reference:
    //AddChromosomeNoLocation(expected_errors, entry);

    eval = validator.Validate(seh, options);

    CheckErrors (*eval, expected_errors);

    // Macro defined elsewhere; by its name it disposes of the expected_errors entries.
    CLEAR_ERRORS

}
24764 
24765 
// VR-819: the country "Denmark: Aarhus Bay Station M5" combined with the
// lat-lon "56.1033 N 10.4578 E" must validate without any errors, and
// CSubSource::ValidateLatLonCountry must return an empty string for the
// same pair.
BOOST_AUTO_TEST_CASE(Test_VR_819)
{
    // prepare entry
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_country, "Denmark: Aarhus Bay Station M5");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_lat_lon, "56.1033 N 10.4578 E");

    // Macro defined elsewhere in the test sources; the code below relies on it
    // for seh, options, validator, eval and expected_errors.
    STANDARD_SETUP

    // No expected errors are registered, so the validator must stay silent.
    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    // Exercise the lat-lon/country check directly. lcErr and latlon are passed
    // as lvalues; only the returned string is checked. (An unused local that
    // merely stringified lcErr used to be computed here and was dropped.)
    CSubSource::ELatLonCountryErr lcErr;
    string latlon = "56.1033 N 10.4578 E";
    string retval = CSubSource::ValidateLatLonCountry("Denmark: Aarhus Bay Station M5", latlon, true, lcErr);
    BOOST_CHECK_EQUAL(retval, kEmptyStr);

    CLEAR_ERRORS
}
24787 
24788 
// VR-824: a type_material OrgMod of "allotype of Bonnetina tenuiverpis" is
// expected to validate without any errors (no expected errors are registered).
BOOST_AUTO_TEST_CASE(Test_VR_824)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_type_material, "allotype of Bonnetina tenuiverpis");

    // Macro defined elsewhere in the test sources; the code below relies on it
    // for seh, options, validator, eval and expected_errors.
    STANDARD_SETUP

    eval = validator.Validate(seh, options);
    // Disabled expectation, kept for reference:
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24802 
24803 
TestNewAccessionOnNuc(const string & accession,bool is_prot_acc,bool is_wgs)24804 void TestNewAccessionOnNuc(const string& accession, bool is_prot_acc, bool is_wgs)
24805 {
24806     CRef<CSeq_entry> entry = BuildGoodSeq();
24807     CRef<CSeq_id> new_id(new CSeq_id());
24808     new_id->SetGenbank().SetAccession(accession);
24809     entry->SetSeq().SetId().push_back(new_id);
24810 
24811     STANDARD_SETUP
24812 
24813     string acc_str = "gb|" + accession + "|";
24814     if (is_wgs) {
24815         expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error,
24816             "InconsistentMolInfoTechnique",
24817             "WGS accession should have Mol-info.tech of wgs"));
24818     }
24819     if (is_prot_acc) {
24820         expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "BadSeqIdFormat", "Bad accession " + accession));
24821     }
24822     //AddChromosomeNoLocation(expected_errors, acc_str);
24823     eval = validator.Validate(seh, options);
24824     CheckErrors(*eval, expected_errors);
24825 
24826     CLEAR_ERRORS
24827 
24828     scope.RemoveTopLevelSeqEntry(seh);
24829     SetTech(entry, CMolInfo::eTech_wgs);
24830     seh = scope.AddTopLevelSeqEntry(*entry);
24831     if (is_prot_acc) {
24832         expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "BadSeqIdFormat", "Bad accession " + accession));
24833     }
24834     if (!is_wgs) {
24835         expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "InconsistentWGSFlags", "Mol-info.tech of wgs should have WGS accession"));
24836     }
24837     AddChromosomeNoLocation(expected_errors, acc_str);
24838     eval = validator.Validate(seh, options);
24839     CheckErrors(*eval, expected_errors);
24840 
24841     CLEAR_ERRORS
24842 }
24843 
24844 
TestNewAccessionOnStandaloneProt(const string & accession,bool is_nuc_acc,bool is_wgs)24845 void TestNewAccessionOnStandaloneProt(const string& accession, bool is_nuc_acc, bool is_wgs)
24846 {
24847     CRef<CSeq_entry> entry = BuildGoodProtSeq();
24848     CRef<CSeq_id> new_id(new CSeq_id());
24849     new_id->SetGenbank().SetAccession(accession);
24850     entry->SetSeq().SetId().push_back(new_id);
24851 
24852     STANDARD_SETUP
24853 
24854     string acc_str = "gb|" + accession + "|";
24855     expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "OrphanedProtein", "Orphaned stand-alone protein"));
24856     if (is_nuc_acc) {
24857         expected_errors.push_back(new CExpectedError(acc_str, eDiag_Error, "BadSeqIdFormat", "Bad accession " + accession));
24858     }
24859     //AddChromosomeNoLocation(expected_errors, acc_str);
24860     eval = validator.Validate(seh, options);
24861     CheckErrors(*eval, expected_errors);
24862 
24863     CLEAR_ERRORS
24864 }
24865 
24866 
TestNewAccessionOnNucProt(const string & n_acc,const string & p_acc,bool is_wgs)24867 void TestNewAccessionOnNucProt(const string& n_acc, const string& p_acc, bool is_wgs)
24868 {
24869     CRef<CSeq_entry> entry = BuildGoodNucProtSet();
24870     CRef<CSeq_id> nid(new CSeq_id());
24871     nid->SetGenbank().SetAccession(n_acc);
24872     unit_test_util::ChangeNucProtSetNucId(entry, nid);
24873     CRef<CSeq_id> pid(new CSeq_id());
24874     pid->SetGenbank().SetAccession(p_acc);
24875     unit_test_util::ChangeNucProtSetProteinId(entry, pid);
24876     STANDARD_SETUP
24877 
24878     if (is_wgs) {
24879         expected_errors.push_back(new CExpectedError("gb|" + n_acc + "|", eDiag_Error, "InconsistentMolInfoTechnique", "WGS accession should have Mol-info.tech of wgs"));
24880     }
24881     eval = validator.Validate(seh, options);
24882     //AddChromosomeNoLocation(expected_errors, entry);
24883     CheckErrors(*eval, expected_errors);
24884 
24885     CLEAR_ERRORS
24886 }
24887 
24888 
TestNewAccessionAsInference(const string & acc)24889 void TestNewAccessionAsInference(const string& acc)
24890 {
24891     CRef<CSeq_entry> entry = BuildGoodSeq();
24892     CRef<CSeq_feat> misc = AddMiscFeature(entry);
24893     misc->SetQual().push_back(CRef<CGb_qual>(new CGb_qual("inference", "similar to DNA sequence:INSD:" + acc + ".1")));
24894 
24895     STANDARD_SETUP
24896 
24897     eval = validator.Validate(seh, options);
24898     //AddChromosomeNoLocation(expected_errors, entry);
24899     CheckErrors(*eval, expected_errors);
24900 
24901     CLEAR_ERRORS
24902 }
24903 
24904 
// SQD-4560: new accession formats, tried on a nucleotide, on a stand-alone
// protein, in a nuc-prot set and inside an inference qualifier.
BOOST_AUTO_TEST_CASE(Test_SQD_4560)
{
    // The three accessions under test; the flags passed below state how each
    // one is expected to be classified.
    const string kLongWgsNucAcc = "AAAAAB010000001";
    const string kNucAcc        = "AA12345678";
    const string kWgsProtAcc    = "EAA0000015";

    // on a nucleotide: (accession, is_prot_acc, is_wgs)
    TestNewAccessionOnNuc(kLongWgsNucAcc, false, true);
    TestNewAccessionOnNuc(kNucAcc, false, false);
    TestNewAccessionOnNuc(kWgsProtAcc, true, true);

    // on a stand-alone protein: (accession, is_nuc_acc, is_wgs)
    TestNewAccessionOnStandaloneProt(kLongWgsNucAcc, true, true);
    TestNewAccessionOnStandaloneProt(kNucAcc, true, false);
    TestNewAccessionOnStandaloneProt(kWgsProtAcc, false, true);

    // in a nuc-prot set: (nucleotide accession, protein accession, is_wgs)
    TestNewAccessionOnNucProt(kLongWgsNucAcc, kWgsProtAcc, true);
    TestNewAccessionOnNucProt(kNucAcc, kWgsProtAcc, false);

    // inside an inference qualifier
    TestNewAccessionAsInference(kLongWgsNucAcc);
    TestNewAccessionAsInference(kNucAcc);
    TestNewAccessionAsInference(kWgsProtAcc);
}
24922 
// VR-852: FixSpecificHost must return "unclassified sequences" unchanged.
BOOST_AUTO_TEST_CASE(Test_VR_852)
{
    BOOST_CHECK_EQUAL("unclassified sequences", objects::validator::FixSpecificHost("unclassified sequences"));
}
24927 
24928 
// VR-875: a metagenomic / environmental-sample source for
// "Phascolarctobacterium sp." (taxon 2049039) with isolate, isolation source,
// host, metagenome source and collection date is expected to validate without
// errors, both with the default options and with eVal_genome_submission added.
BOOST_AUTO_TEST_CASE(Test_VR_875)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetTaxname(entry, "Phascolarctobacterium sp.");
    // SetTaxon is called with 0 and then with the real id.
    // NOTE(review): presumably the 0 call clears the existing taxon reference -- confirm.
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 2049039);
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_isolate, "P2B-1");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_isolation_source, "human stool");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_nat_host, "Homo sapiens");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_environmental_sample, "TRUE");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_metagenomic, "TRUE");
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_metagenome_source, "human gut metagenome");
    unit_test_util::SetSubSource(entry, CSubSource::eSubtype_collection_date, "2018");

    // Macro defined elsewhere in the test sources; the code below relies on it
    // for seh, options, validator, eval and expected_errors.
    STANDARD_SETUP

    // First run: default options, no expected errors.
    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Second run: same entry validated as a genome submission, still no expected errors.
    options |= CValidator::eVal_genome_submission;
    eval = validator.Validate(seh, options);
    //AddChromosomeNoLocation(expected_errors, entry);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS
}
24958 
BOOST_AUTO_TEST_CASE(Test_RW_1063)24959 BOOST_AUTO_TEST_CASE(Test_RW_1063)
24960 {
24961     CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
24962     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_serotype, "an innocuous value");
24963     STANDARD_SETUP
24964 
24965     // no errors, because not salmonella
24966     eval = validator.Validate(seh, options);
24967     CheckErrors(*eval, expected_errors);
24968 
24969     CLEAR_ERRORS
24970 
24971     // no salmonella errors because not first word
24972     unit_test_util::SetTaxname(entry, "Badforyou Salmonella");
24973     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
24974         "Organism not found in taxonomy database"));
24975     eval = validator.Validate(seh, options);
24976     CheckErrors(*eval, expected_errors);
24977 
24978     CLEAR_ERRORS
24979 
24980     unit_test_util::SetTaxname(entry, "Salmonella");
24981     eval = validator.Validate(seh, options);
24982 
24983     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error,
24984         "TaxonomyLookupProblem", "Organism name is 'Salmonella', taxonomy ID should be '590' but is '592768'"));
24985     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
24986         "TaxonomyIsSpeciesProblem", "Taxonomy lookup reports is_species_level FALSE"));
24987     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
24988     CheckErrors(*eval, expected_errors);
24989 
24990     CLEAR_ERRORS
24991 
24992     unit_test_util::SetTaxname(entry, "Salmonella badforyou");
24993     eval = validator.Validate(seh, options);
24994     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
24995         "Organism not found in taxonomy database"));
24996     expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
24997     CheckErrors(*eval, expected_errors);
24998 
24999     CLEAR_ERRORS
25000 
25001     scope.RemoveTopLevelSeqEntry(seh);
25002     entry->SetSeq().SetId().push_back(unit_test_util::BuildRefSeqId());
25003     seh = scope.AddTopLevelSeqEntry(*entry);
25004     eval = validator.Validate(seh, options);
25005     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "OrganismNotFound",
25006         "Organism not found in taxonomy database"));
25007     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25008     CheckErrors(*eval, expected_errors);
25009 
25010     CLEAR_ERRORS
25011 
25012     // presence of serovar
25013     unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_serovar, "different value");
25014     eval = validator.Validate(seh, options);
25015     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "OrganismNotFound",
25016         "Organism not found in taxonomy database"));
25017     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25018     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "BadOrgMod", "Salmonella organism name should contain the serovar value."));
25019     CheckErrors(*eval, expected_errors);
25020 
25021     CLEAR_ERRORS
25022 
25023     unit_test_util::SetTaxname(entry, "Salmonella badforyou different value");
25024     eval = validator.Validate(seh, options);
25025     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Warning, "OrganismNotFound",
25026         "Organism not found in taxonomy database"));
25027     expected_errors.push_back(new CExpectedError("ref|NC_123456|", eDiag_Error, "BadOrgMod", "Salmonella organisms should use serovar instead of serotype."));
25028     CheckErrors(*eval, expected_errors);
25029 
25030     CLEAR_ERRORS
25031 
25032 
25033 }
25034 
25035 
// RW-1064: a serovar OrgMod is not reported for a non-Salmonella organism,
// but for a taxname starting with "Salmonella" that does not contain the
// serovar value a BadOrgMod warning is expected.
BOOST_AUTO_TEST_CASE(Test_RW_1064)
{
    CRef<CSeq_entry> entry = unit_test_util::BuildGoodSeq();
    unit_test_util::SetTaxname(entry, "Streptococcus agalactiae NEM316");
    // SetTaxon is called with 0 and then with the real id.
    // NOTE(review): presumably the 0 call clears the existing taxon reference -- confirm.
    unit_test_util::SetTaxon(entry, 0);
    unit_test_util::SetTaxon(entry, 211110);
    unit_test_util::SetOrgMod(entry, COrgMod::eSubtype_serovar, "an innocuous value");
    // Macro defined elsewhere in the test sources; the code below relies on it
    // for seh, options, validator, eval and expected_errors.
    STANDARD_SETUP

    // no errors, because not salmonella
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

    // Salmonella-style name that is expected to be unknown to the taxonomy
    // lookup and that does not contain the serovar value
    unit_test_util::SetTaxname(entry, "Salmonella badforyou");
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "OrganismNotFound",
        "Organism not found in taxonomy database"));
    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning, "BadOrgMod", "Salmonella organism name should contain the serovar value."));
    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);

    CLEAR_ERRORS

}
25061 
// TM-897 (compiled out by the #if 0 below): a misspelled taxname
// ("Salmonela enterica") is expected to yield an OrganismNotFound warning
// that lists possible matches. The test points NI_SERVICE_NAME_TAXON3 at
// "TaxService3v4test" before validating.
// NOTE(review): presumably disabled because it depends on that test taxonomy
// service -- confirm before re-enabling.
#if 0
BOOST_AUTO_TEST_CASE(Test_TM_897)
{
    CNcbiEnvironment env;
    env.Set("NI_SERVICE_NAME_TAXON3", "TaxService3v4test");

    CRef<CSeq_entry> entry = BuildGoodSeq();
    // The misspelling is the input under test.
    SetTaxname(entry, "Salmonela enterica");
    STANDARD_SETUP

    expected_errors.push_back(new CExpectedError("lcl|good", eDiag_Warning,
        "OrganismNotFound",
        "Organism not found. Possible matches|Salmonella enterica|Salmonella enterica V|Salmonella enterica subsp. V"));

    eval = validator.Validate(seh, options);
    CheckErrors(*eval, expected_errors);


    CLEAR_ERRORS
}
#endif
25083