1 /* $Id: tax_validation_and_cleanup.cpp 636458 2021-08-24 17:53:54Z fukanchi $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Tools for batch processing taxonomy-related validation and cleanup
30 * .......
31 *
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objmgr/object_manager.hpp>
37
38 #include <serial/iterator.hpp>
39
40 #include <objects/seqfeat/BioSource.hpp>
41 #include <objects/seqfeat/OrgMod.hpp>
42 #include <objects/seqfeat/OrgName.hpp>
43 #include <objects/seqfeat/Org_ref.hpp>
44 #include <objects/seqfeat/Seq_feat.hpp>
45 #include <objects/seqfeat/SubSource.hpp>
46
47 #include <objmgr/bioseq_ci.hpp>
48 #include <objmgr/seqdesc_ci.hpp>
49 #include <objmgr/util/feature.hpp>
50
51 #include <objmgr/feat_ci.hpp>
52 #include <objmgr/scope.hpp>
53
54 #include <objects/taxon3/taxon3.hpp>
55 #include <objects/taxon3/Taxon3_reply.hpp>
56
57 #include <objtools/validator/validator.hpp>
58 #include <objtools/validator/validerror_imp.hpp>
59 #include <objtools/validator/tax_validation_and_cleanup.hpp>
60 #include <objtools/validator/utilities.hpp>
61
62 #define NCBI_USE_ERRCODE_X Objtools_Validator
63
64 BEGIN_NCBI_SCOPE
65 BEGIN_SCOPE(objects)
66 BEGIN_SCOPE(validator)
67 using namespace sequence;
68
69 const string kInvalidReplyMsg = "Taxonomy service returned invalid reply";
70
71
CQualifierRequest()72 CQualifierRequest::CQualifierRequest()
73 {
74 x_Init();
75 }
76
77
x_Init()78 void CQualifierRequest::x_Init()
79 {
80 m_ValuesToTry.clear();
81 m_RepliesProcessed = 0;
82 m_Descs.clear();
83 m_Feats.clear();
84 }
85
86
AddParent(CConstRef<CSeqdesc> desc,CConstRef<CSeq_entry> ctx)87 void CQualifierRequest::AddParent(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx)
88 {
89 m_Descs.push_back(TDescPair(desc, ctx));
90 }
91
92
AddParent(CConstRef<CSeq_feat> feat)93 void CQualifierRequest::AddParent(CConstRef<CSeq_feat> feat)
94 {
95 m_Feats.push_back(feat);
96 }
97
98
AddRequests(vector<CRef<COrg_ref>> & request_list) const99 void CQualifierRequest::AddRequests(vector<CRef<COrg_ref> >& request_list) const
100 {
101 for (auto it = m_ValuesToTry.begin(); it != m_ValuesToTry.end(); it++) {
102 CRef<COrg_ref> rq(new COrg_ref);
103 rq->SetTaxname(*it);
104 request_list.push_back(rq);
105 }
106 }
107
108
MatchTryValue(const string & val) const109 bool CQualifierRequest::MatchTryValue(const string& val) const
110 {
111 for (auto it = m_ValuesToTry.begin(); it != m_ValuesToTry.end(); it++) {
112 if (NStr::EqualNocase(val, *it)) {
113 return true;
114 }
115 }
116 return false;
117 }
118
119
PostErrors(CValidError_imp & imp)120 void CQualifierRequest::PostErrors(CValidError_imp& imp)
121 {
122 vector<TTaxError> errs;
123 ListErrors(errs);
124 for (auto e : errs) {
125 for (auto it = m_Descs.begin(); it != m_Descs.end(); it++) {
126 imp.PostObjErr(e.severity, e.err_type, e.err_msg, *(it->first), it->second);
127 }
128 for (auto it = m_Feats.begin(); it != m_Feats.end(); it++) {
129 imp.PostObjErr(e.severity, e.err_type, e.err_msg, **it);
130 }
131 }
132 }
133
134
CSpecificHostRequest(const string & host,const COrg_ref & org,bool for_fix)135 CSpecificHostRequest::CSpecificHostRequest(const string& host, const COrg_ref& org, bool for_fix) :
136 CQualifierRequest(),
137 m_Host(host),
138 m_Response(eUnrecognized),
139 m_HostLineage(kEmptyStr),
140 m_OrgLineage(kEmptyStr)
141 {
142 string host_check = SpecificHostValueToCheck(host);
143 if (NStr::IsBlank(host_check)) {
144 m_Response = eNormal;
145 return;
146 }
147 if (!for_fix && !NStr::Equal(host, host_check)) {
148 m_ValuesToTry.push_back(host_check);
149 }
150 m_ValuesToTry.push_back(host);
151
152 m_SuggestedFix.clear();
153 if (org.IsSetLineage()) {
154 m_OrgLineage = org.GetLineage();
155 }
156 }
157
158
AddReply(const CT3Reply & reply)159 void CSpecificHostRequest::AddReply(const CT3Reply& reply)
160 {
161 if (m_Response == eAmbiguous) {
162 string new_error = InterpretSpecificHostResult(m_ValuesToTry[m_RepliesProcessed], reply, m_Host);
163 if (NStr::IsBlank(new_error)) {
164 m_Response = eNormal;
165 m_SuggestedFix = m_Host;
166 m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
167 m_Error = kEmptyStr;
168 }
169 } else if (m_Response == eUnrecognized) {
170 m_Error = InterpretSpecificHostResult(m_ValuesToTry[m_RepliesProcessed], reply, m_Host);
171 if (NStr::IsBlank(m_Error)) {
172 m_Response = eNormal;
173 m_SuggestedFix = m_Host;
174 m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
175 } else if (NStr::Find(m_Error, "ambiguous") != NPOS) {
176 m_Response = eAmbiguous;
177 } else if (NStr::StartsWith(m_Error, "Invalid value for specific host") && !IsLikelyTaxname(m_Host)) {
178 m_Response = eNormal;
179 m_SuggestedFix = m_Host;
180 } else if (NStr::StartsWith(m_Error, "Specific host value is alternate name")) {
181 m_Response = eAlternateName;
182 m_SuggestedFix = reply.GetData().GetOrg().GetTaxname();
183 m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
184 } else {
185 m_Response = eUnrecognized;
186 if (NStr::IsBlank(m_SuggestedFix) && reply.IsData() && reply.GetData().IsSetOrg()) {
187 if (HasMisSpellFlag(reply.GetData())) {
188 m_SuggestedFix = reply.GetData().GetOrg().GetTaxname();
189 m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
190 } else if (!FindMatchInOrgRef(m_Host, reply.GetData().GetOrg())
191 && !IsCommonName(reply.GetData())) {
192 m_SuggestedFix = reply.GetData().GetOrg().GetTaxname();
193 m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
194 }
195 }
196 }
197 }
198 m_RepliesProcessed++;
199 }
200
201
ListErrors(vector<TTaxError> & errs) const202 void CSpecificHostRequest::ListErrors(vector<TTaxError>& errs) const
203 {
204 switch (m_Response) {
205 case eNormal:
206 break;
207 case eAmbiguous:
208 errs.push_back(TTaxError{ eDiag_Info, eErr_SEQ_DESCR_AmbiguousSpecificHost, m_Error });
209 break;
210 case eUnrecognized:
211 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_BadSpecificHost, m_Error });
212 break;
213 case eAlternateName:
214 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_BadSpecificHost, m_Error });
215 break;
216 }
217
218 if (!NStr::IsBlank(m_HostLineage) && !NStr::IsBlank(m_OrgLineage) &&
219 (NStr::Find(m_OrgLineage, "Streptophyta") != NPOS || NStr::Find(m_OrgLineage, "Metazoa") != NPOS) &&
220 (NStr::Find(m_HostLineage, "Fungi;") != NPOS || NStr::Find(m_HostLineage, "Bacteria") != NPOS ||
221 NStr::Find(m_HostLineage, "Archaea") != NPOS || NStr::Find(m_HostLineage, "Viruses") != NPOS)) {
222 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_BadSpecificHost,
223 "Suspect Host Value - a prokaryote, fungus or virus is suspect as a host for a plant or animal" });
224 }
225 }
226
227
228 //LCOV_EXCL_START
229 //used by cleanup
SuggestFix() const230 const string& CSpecificHostRequest::SuggestFix() const
231 {
232 if (m_ValuesToTry.empty()) {
233 return m_Host;
234 } else {
235 return m_SuggestedFix;
236 }
237 }
238 //LCOV_EXCL_STOP
239
240
x_IgnoreStrain(const string & str)241 bool CStrainRequest::x_IgnoreStrain(const string& str)
242 {
243 // per VR-762, ignore strain if combination of letters and numbers
244 bool has_number = false;
245 bool has_letter = false;
246 for (size_t i = 0; i < str.length(); i++) {
247 char ch = str.c_str()[i];
248 if (isdigit(ch)) {
249 has_number = true;
250 } else if (isalpha(ch)) {
251 has_letter = true;
252 } else {
253 return false;
254 }
255 }
256 if (!has_number || !has_letter) {
257 return false;
258 } else {
259 return true;
260 }
261 }
262
263
CStrainRequest(const string & strain,const COrg_ref & org)264 CStrainRequest::CStrainRequest(const string& strain, const COrg_ref& org) : CQualifierRequest(), m_Strain(strain)
265 {
266 if (org.IsSetTaxname()) {
267 m_Taxname = org.GetTaxname();
268 } else {
269 m_Taxname.clear();
270 }
271
272 m_IsInvalid = false;
273 if (NStr::IsBlank(strain) || x_IgnoreStrain(strain)) {
274 return;
275 }
276
277 m_ValuesToTry.push_back(strain);
278 size_t pos = 0;
279 while (strain[pos] != 0 && isalpha(strain[pos])) {
280 ++pos;
281 }
282 if (pos < strain.length() && pos >= 5) {
283 m_ValuesToTry.push_back(strain.substr(0, pos));
284 }
285
286 if (RequireTaxname(m_Taxname)) {
287 m_ValuesToTry.push_back(MakeKey(strain, m_Taxname));
288 }
289 }
290
291
// Returns the lookup key for a strain. When RequireTaxname() holds for the
// taxname, the key is the taxname minus its last three characters followed
// by the strain; otherwise it is the strain alone.
string CStrainRequest::MakeKey(const string& strain, const string& taxname)
{
    if (!RequireTaxname(taxname)) {
        return strain;
    }
    const string taxname_stem = taxname.substr(0, taxname.length() - 3);
    return taxname_stem + strain;
}
300
301
RequireTaxname(const string & taxname)302 bool CStrainRequest::RequireTaxname(const string& taxname)
303 {
304 if (NStr::EndsWith(taxname, " sp.")) {
305 return true;
306 } else {
307 return false;
308 }
309 }
310
311
x_IsUnwanted(const string & str)312 bool CStrainRequest::x_IsUnwanted(const string& str)
313 {
314 if (NStr::FindNoCase(str, "virus") != NPOS ||
315 NStr::FindNoCase(str, "viroid") != NPOS ||
316 NStr::FindNoCase(str, "vector") != NPOS ||
317 NStr::FindNoCase(str, "phage") != NPOS) {
318 return true;
319 } else {
320 return false;
321 }
322 }
323
324
Check(const COrg_ref & org)325 bool CStrainRequest::Check(const COrg_ref& org)
326 {
327 if (org.IsSetLineage() && x_IsUnwanted(org.GetLineage())) {
328 return false;
329 }
330 if (org.IsSetTaxname() && x_IsUnwanted(org.GetTaxname())) {
331 return false;
332 }
333 if (!org.IsSetOrgMod()) {
334 return false;
335 }
336 for (auto it : org.GetOrgname().GetMod()) {
337 if (it->IsSetSubtype() && it->IsSetSubname() &&
338 it->GetSubtype() == COrgMod::eSubtype_strain) {
339 return true;
340 }
341 }
342 return false;
343 }
344
345
ListErrors(vector<TTaxError> & errs) const346 void CStrainRequest::ListErrors(vector<TTaxError>& errs) const
347 {
348 if (m_IsInvalid) {
349 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_StrainContainsTaxInfo,
350 "Strain '" + m_Strain + "' contains taxonomic name information" });
351 }
352 }
353
354
AddReply(const CT3Reply & reply)355 void CStrainRequest::AddReply(const CT3Reply& reply)
356 {
357 if (!m_IsInvalid) {
358 if (reply.IsData() && reply.GetData().IsSetOrg()) {
359 // TODO: if using just a one word input, make sure name is actually in taxname
360 if (m_ValuesToTry[m_RepliesProcessed].length() < m_Strain.length()) {
361 if (NStr::EqualNocase(m_ValuesToTry[m_RepliesProcessed], reply.GetData().GetOrg().GetTaxname())) {
362 m_IsInvalid = true;
363 }
364 } else {
365 m_IsInvalid = true;
366 }
367 }
368 }
369 m_RepliesProcessed++;
370 }
371
372
Clear()373 void CQualLookupMap::Clear()
374 {
375 m_Populated = false;
376 m_Map.clear();
377 }
378
379
AddDesc(CConstRef<CSeqdesc> desc,CConstRef<CSeq_entry> ctx)380 void CQualLookupMap::AddDesc(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx)
381 {
382 m_Populated = true;
383 if (!desc->IsSource() || !desc->GetSource().IsSetOrg()) {
384 return;
385 }
386 const COrg_ref& org = desc->GetSource().GetOrg();
387 if (!org.IsSetOrgMod()) {
388 return;
389 }
390 if (!Check(org)) {
391 return;
392 }
393 for (auto mod_it = org.GetOrgname().GetMod().begin(); mod_it != org.GetOrgname().GetMod().end(); mod_it++) {
394 if ((*mod_it)->IsSetSubtype()
395 && (*mod_it)->GetSubtype() == m_Subtype
396 && (*mod_it)->IsSetSubname()) {
397 string qual = (*mod_it)->GetSubname();
398 string key = GetKey(qual, org);
399 TQualifierRequests::iterator find = m_Map.find(key);
400 if (find == m_Map.end()) {
401 m_Map[key] = x_MakeNewRequest(qual, org);
402 m_Map[key]->AddParent(desc, ctx);
403 } else {
404 find->second->AddParent(desc, ctx);
405 }
406 }
407 }
408 }
409
410
AddFeat(CConstRef<CSeq_feat> feat)411 void CQualLookupMap::AddFeat(CConstRef<CSeq_feat> feat)
412 {
413 m_Populated = true;
414 if (!feat->IsSetData() || !feat->GetData().IsBiosrc() ||
415 !feat->GetData().GetBiosrc().IsSetOrg()) {
416 return;
417 }
418 const COrg_ref& org = feat->GetData().GetBiosrc().GetOrg();
419 if (!org.IsSetOrgMod()) {
420 return;
421 }
422 if (!Check(org)) {
423 return;
424 }
425 for (auto mod_it = org.GetOrgname().GetMod().begin(); mod_it != org.GetOrgname().GetMod().end(); mod_it++) {
426 if ((*mod_it)->IsSetSubtype()
427 && (*mod_it)->GetSubtype() == m_Subtype
428 && (*mod_it)->IsSetSubname()) {
429 string qual = (*mod_it)->GetSubname();
430 string key = GetKey(qual, feat->GetData().GetBiosrc().GetOrg());
431 TQualifierRequests::iterator find = m_Map.find(key);
432 if (find == m_Map.end()) {
433 m_Map[key] = x_MakeNewRequest(qual, feat->GetData().GetBiosrc().GetOrg());
434 m_Map[key]->AddParent(feat);
435 } else {
436 find->second->AddParent(feat);
437 }
438 }
439 }
440 }
441
442
AddOrg(const COrg_ref & org)443 void CQualLookupMap::AddOrg(const COrg_ref& org)
444 {
445 m_Populated = true;
446 if (!org.IsSetOrgMod()) {
447 return;
448 }
449 if (!Check(org)) {
450 return;
451 }
452 for (auto mod_it = org.GetOrgname().GetMod().begin(); mod_it != org.GetOrgname().GetMod().end(); mod_it++) {
453 if ((*mod_it)->IsSetSubtype()
454 && (*mod_it)->GetSubtype() == m_Subtype
455 && (*mod_it)->IsSetSubname()) {
456 string qual = (*mod_it)->GetSubname();
457 string key = GetKey(qual, org);
458 TQualifierRequests::iterator find = m_Map.find(key);
459 if (find == m_Map.end()) {
460 m_Map[key] = x_MakeNewRequest(qual, org);
461 }
462 }
463 }
464 }
465
466
467 //LCOV_EXCL_START
468 //only used by biosample
AddString(const string & val)469 void CQualLookupMap::AddString(const string& val)
470 {
471 m_Populated = true;
472 TQualifierRequests::iterator find = m_Map.find(val);
473 if (find == m_Map.end()) {
474 CRef<COrg_ref> org(new COrg_ref());
475 m_Map[val] = x_MakeNewRequest(val, *org);
476 }
477 }
478 //LCOV_EXCL_STOP
479
480
GetRequestList()481 vector<CRef<COrg_ref> > CQualLookupMap::GetRequestList()
482 {
483 vector<CRef<COrg_ref> > org_rq_list;
484 org_rq_list.reserve(m_Map.size());
485 for (auto it = m_Map.begin(); it != m_Map.end(); it++) {
486 it->second->AddRequests(org_rq_list);
487 }
488 return org_rq_list;
489 }
490
491
x_FindRequest(const string & val)492 CQualLookupMap::TQualifierRequests::iterator CQualLookupMap::x_FindRequest(const string& val)
493 {
494 TQualifierRequests::iterator map_it = m_Map.find(val);
495 if (map_it != m_Map.end() && map_it->second->NumRemainingReplies() > 0) {
496 return map_it;
497 }
498 map_it = m_Map.begin();
499 while (map_it != m_Map.end()) {
500 if (map_it->second->MatchTryValue(val) && map_it->second->NumRemainingReplies() > 0) {
501 return map_it;
502 }
503 ++map_it;
504 }
505 return m_Map.end();
506 }
507
508
IncrementalUpdate(const vector<CRef<COrg_ref>> & input,const CTaxon3_reply & reply)509 string CQualLookupMap::IncrementalUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply)
510 {
511 string error_message;
512 CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
513 vector<CRef<COrg_ref> >::const_iterator rq_it = input.begin();
514
515 while (reply_it != reply.GetReply().end() && rq_it != input.end()) {
516 TQualifierRequests::iterator map_it = x_FindRequest((*rq_it)->GetTaxname());
517 if (map_it == m_Map.end()) {
518 error_message = "Unexpected taxonomy response for " + (*rq_it)->GetTaxname();
519 return error_message;
520 }
521 map_it->second->AddReply(**reply_it);
522 ++rq_it;
523 ++reply_it;
524 }
525
526 if (reply_it != reply.GetReply().end()) {
527 error_message = "Unexpected taxonomy responses for " + COrgMod::GetSubtypeName(m_Subtype);
528 }
529 return kEmptyStr;
530 }
531
532
533 //LCOV_EXCL_START
534 //only used for cleanup
IsUpdateComplete() const535 bool CQualLookupMap::IsUpdateComplete() const
536 {
537 TQualifierRequests::const_iterator rq_it = m_Map.cbegin();
538 while (rq_it != m_Map.cend()) {
539 if (rq_it->second->NumRemainingReplies() > 0) {
540 return false;
541 break;
542 }
543 ++rq_it;
544 }
545 return true;
546 }
547 //LCOV_EXCL_STOP
548
549
PostErrors(CValidError_imp & imp)550 void CQualLookupMap::PostErrors(CValidError_imp& imp)
551 {
552 TQualifierRequests::iterator rq_it = m_Map.begin();
553 while (rq_it != m_Map.end()) {
554 rq_it->second->PostErrors(imp);
555 ++rq_it;
556 }
557 }
558
559
560 //LCOV_EXCL_START
561 //only used by biosample
ListErrors(vector<TTaxError> & errs) const562 void CQualLookupMap::ListErrors(vector<TTaxError>& errs) const
563 {
564 for (auto rq_it : m_Map) {
565 rq_it.second->ListErrors(errs);
566 }
567 }
568
569
570 //LCOV_EXCL_STOP
571
572
x_MakeNewRequest(const string & orig_val,const COrg_ref & org)573 CRef<CQualifierRequest> CSpecificHostMap::x_MakeNewRequest(const string& orig_val, const COrg_ref& org)
574 {
575 CRef<CQualifierRequest> rq(new CSpecificHostRequest(orig_val, org));
576 return rq;
577 }
578
579
580 //LCOV_EXCL_START
581 //used for cleanup
x_MakeNewRequest(const string & orig_val,const COrg_ref & org)582 CRef<CQualifierRequest> CSpecificHostMapForFix::x_MakeNewRequest(const string& orig_val, const COrg_ref& org)
583 {
584 CRef<CQualifierRequest> rq(new CSpecificHostRequest(orig_val, org, true));
585 return rq;
586 }
587
588
// Returns host_val with surrounding spaces removed and then passed through
// COrgMod::FixHost().
string CSpecificHostMapForFix::x_DefaultSpecificHostAdjustments(const string& host_val)
{
    string trimmed = host_val;
    NStr::TruncateSpacesInPlace(trimmed);
    trimmed = COrgMod::FixHost(trimmed);
    return trimmed;
}
596
597
ApplyToOrg(COrg_ref & org_ref) const598 bool CSpecificHostMapForFix::ApplyToOrg(COrg_ref& org_ref) const
599 {
600 if (!org_ref.IsSetOrgname() ||
601 !org_ref.GetOrgname().IsSetMod()) {
602 return false;
603 }
604
605 bool changed = false;
606
607 for (auto m = org_ref.SetOrgname().SetMod().begin(); m != org_ref.SetOrgname().SetMod().end(); m++) {
608 if ((*m)->IsSetSubtype() &&
609 (*m)->GetSubtype() == COrgMod::eSubtype_nat_host &&
610 (*m)->IsSetSubname()) {
611 string host_val = x_DefaultSpecificHostAdjustments((*m)->GetSubname());
612 TQualifierRequests::const_iterator it = m_Map.find(host_val);
613 if (it != m_Map.end()) {
614 const CSpecificHostRequest* rq = dynamic_cast<const CSpecificHostRequest *>(it->second.GetPointer());
615 string new_val = x_DefaultSpecificHostAdjustments(rq->SuggestFix());
616 if (!NStr::IsBlank(new_val) && !NStr::Equal(new_val, (*m)->GetSubname())) {
617 (*m)->SetSubname(new_val);
618 changed = true;
619 }
620 }
621 }
622 }
623
624 return changed;
625 }
626 //LCOV_EXCL_STOP
627
628
x_MakeNewRequest(const string & orig_val,const COrg_ref & org)629 CRef<CQualifierRequest> CStrainMap::x_MakeNewRequest(const string& orig_val, const COrg_ref& org)
630 {
631 CRef<CQualifierRequest> rq(new CStrainRequest(orig_val, org));
632 return rq;
633 }
634
635
CTaxValidationAndCleanup()636 CTaxValidationAndCleanup::CTaxValidationAndCleanup()
637 {
638 m_SrcDescs.clear();
639 m_DescCtxs.clear();
640 m_SrcFeats.clear();
641 m_SpecificHostRequests.clear();
642 m_SpecificHostRequestsBuilt = false;
643 m_SpecificHostRequestsUpdated = false;
644 m_StrainRequestsBuilt = false;
645 }
646
647
Init(const CSeq_entry & se)648 void CTaxValidationAndCleanup::Init(const CSeq_entry& se)
649 {
650 m_SrcDescs.clear();
651 m_DescCtxs.clear();
652 m_SrcFeats.clear();
653 m_SpecificHostRequests.clear();
654 m_SpecificHostRequestsBuilt = false;
655 m_SpecificHostRequestsUpdated = false;
656 m_StrainRequestsBuilt = false;
657 x_GatherSources(se);
658 }
659
660
GetTopReportObject() const661 CConstRef<CSeq_entry> CTaxValidationAndCleanup::GetTopReportObject() const
662 {
663 if (!m_DescCtxs.empty()) {
664 return m_DescCtxs.front();
665 } else {
666 return CConstRef<CSeq_entry>(NULL);
667 }
668 }
669
670
x_GatherSources(const CSeq_entry & se)671 void CTaxValidationAndCleanup::x_GatherSources(const CSeq_entry& se)
672 {
673 // get source descriptors
674 FOR_EACH_DESCRIPTOR_ON_SEQENTRY(it, se)
675 {
676 if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
677 CConstRef<CSeqdesc> desc;
678 desc.Reset(*it);
679 m_SrcDescs.push_back(desc);
680 CConstRef<CSeq_entry> r_se;
681 r_se.Reset(&se);
682 m_DescCtxs.push_back(r_se);
683 }
684 }
685 // also get features
686 FOR_EACH_ANNOT_ON_SEQENTRY(annot_it, se)
687 {
688 FOR_EACH_SEQFEAT_ON_SEQANNOT(feat_it, **annot_it)
689 {
690 if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsBiosrc()
691 && (*feat_it)->GetData().GetBiosrc().IsSetOrg()) {
692 CConstRef<CSeq_feat> feat;
693 feat.Reset(*feat_it);
694 m_SrcFeats.push_back(feat);
695 }
696 }
697 }
698
699 // if set, recurse
700 if (se.IsSet()) {
701 FOR_EACH_SEQENTRY_ON_SEQSET(it, se.GetSet())
702 {
703 x_GatherSources(**it);
704 }
705 }
706 }
707
708
GetTaxonomyLookupRequest() const709 vector< CRef<COrg_ref> > CTaxValidationAndCleanup::GetTaxonomyLookupRequest() const
710 {
711 // request list for taxon3
712 vector< CRef<COrg_ref> > org_rq_list;
713
714 // first do descriptors
715 vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.cbegin();
716 vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.cbegin();
717 while (desc_it != m_SrcDescs.cend() && ctx_it != m_DescCtxs.cend()) {
718 CRef<COrg_ref> rq(new COrg_ref);
719 const COrg_ref& org = (*desc_it)->GetSource().GetOrg();
720 rq->Assign(org);
721 org_rq_list.push_back(rq);
722
723 ++desc_it;
724 ++ctx_it;
725 }
726
727 // now do features
728 vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.cbegin();
729 while (feat_it != m_SrcFeats.cend()) {
730 CRef<COrg_ref> rq(new COrg_ref);
731 const COrg_ref& org = (*feat_it)->GetData().GetBiosrc().GetOrg();
732 rq->Assign(org);
733 org_rq_list.push_back(rq);
734
735 ++feat_it;
736 }
737 return org_rq_list;
738 }
739
740
x_InterpretTaxonomyError(const CT3Error & error,const COrg_ref & org,const EErrType type,vector<TTaxError> & errs) const741 void CTaxValidationAndCleanup::x_InterpretTaxonomyError(const CT3Error& error, const COrg_ref& org, const EErrType type, vector<TTaxError>& errs) const
742 {
743 const string err_str = error.IsSetMessage() ? error.GetMessage() : "?";
744
745 if (NStr::Equal(err_str, "Organism not found")) {
746 string msg = "Organism not found in taxonomy database";
747 if (error.IsSetOrg() && error.GetOrg().IsSetTaxname() &&
748 !NStr::Equal(error.GetOrg().GetTaxname(), "Not valid") &&
749 (!org.IsSetTaxname() ||
750 !NStr::Equal(org.GetTaxname(), error.GetOrg().GetTaxname()))) {
751 msg += " (suggested:" + error.GetOrg().GetTaxname() + ")";
752 }
753 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_OrganismNotFound, msg });
754 } else if (NStr::StartsWith(err_str, "Organism not found. Possible matches")) {
755 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_OrganismNotFound, err_str });
756 } else if (NStr::Equal(err_str, kInvalidReplyMsg)) {
757 errs.push_back(TTaxError{ eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, err_str });
758 } else if (NStr::Find(err_str, "ambiguous name") != NPOS) {
759 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_TaxonomyAmbiguousName,
760 "Taxonomy lookup failed with message '" + err_str + "'"});
761 } else {
762 errs.push_back(TTaxError{ eDiag_Warning, type,
763 "Taxonomy lookup failed with message '" + err_str + "'" });
764 }
765 }
766
767
ListTaxLookupErrors(const CT3Reply & reply,const COrg_ref & org,CBioSource::TGenome genome,bool is_insd_patent,bool is_wp,vector<TTaxError> & errs) const768 void CTaxValidationAndCleanup::ListTaxLookupErrors
769 (const CT3Reply& reply, const COrg_ref& org, CBioSource::TGenome genome, bool is_insd_patent, bool is_wp, vector<TTaxError>& errs) const
770 {
771 if (reply.IsError()) {
772 x_InterpretTaxonomyError(reply.GetError(), org, eErr_SEQ_DESCR_TaxonomyLookupProblem, errs);
773 } else if (reply.IsData()) {
774 bool is_species_level = true;
775 bool is_unidentified = false;
776 bool force_consult = false;
777 bool has_nucleomorphs = false;
778 if (reply.GetData().IsSetOrg()) {
779 const COrg_ref& orp_rep = reply.GetData().GetOrg();
780 if (org.IsSetTaxname() && orp_rep.IsSetTaxname()) {
781 const string& taxname_req = org.GetTaxname();
782 const string& taxname_rep = orp_rep.GetTaxname();
783 if (NStr::Equal(taxname_rep, "unidentified")) {
784 is_unidentified = true;
785 }
786 TTaxId taxid_request = org.GetTaxId();
787 TTaxId taxid_reply = orp_rep.GetTaxId();
788
789 if (taxid_request != ZERO_TAX_ID && taxid_reply != ZERO_TAX_ID && taxid_request != taxid_reply) {
790 errs.push_back(TTaxError{ eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem,
791 "Organism name is '" + taxname_req
792 + "', taxonomy ID should be '" + NStr::NumericToString(taxid_reply)
793 + "' but is '" + NStr::NumericToString(taxid_request) + "'" });
794 }
795 }
796 }
797 reply.GetData().GetTaxFlags(is_species_level, force_consult, has_nucleomorphs);
798 if (!is_species_level && !is_wp) {
799 errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem,
800 "Taxonomy lookup reports is_species_level FALSE"});
801 }
802 if (force_consult) {
803 if (is_insd_patent && is_unidentified) {
804 force_consult = false;
805 }
806 }
807 if (force_consult) {
808 errs.push_back(TTaxError{eDiag_Warning, eErr_SEQ_DESCR_TaxonomyConsultRequired,
809 "Taxonomy lookup reports taxonomy consultation needed"});
810 }
811 if (genome == CBioSource::eGenome_nucleomorph
812 && !has_nucleomorphs) {
813 errs.push_back(TTaxError{eDiag_Warning, eErr_SEQ_DESCR_TaxonomyNucleomorphProblem,
814 "Taxonomy lookup does not have expected nucleomorph flag"});
815 } else if (genome == CBioSource::eGenome_plastid
816 && (!reply.GetData().HasPlastids())) {
817 errs.push_back(TTaxError{eDiag_Warning, eErr_SEQ_DESCR_TaxonomyPlastidsProblem,
818 "Taxonomy lookup does not have expected plastid flag"});
819 }
820 }
821
822 }
823
ReportTaxLookupErrors(const CTaxon3_reply & reply,CValidError_imp & imp,bool is_insd_patent) const824 void CTaxValidationAndCleanup::ReportTaxLookupErrors
825 (const CTaxon3_reply& reply,
826 CValidError_imp& imp,
827 bool is_insd_patent) const
828 {
829 CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
830
831 // process descriptor responses
832 vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.cbegin();
833 vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.cbegin();
834
835 while (reply_it != reply.GetReply().end()
836 && desc_it != m_SrcDescs.cend()
837 && ctx_it != m_DescCtxs.cend()) {
838 vector<TTaxError> errs;
839 const COrg_ref& orp_req = (*desc_it)->GetSource().GetOrg();
840 ListTaxLookupErrors(**reply_it, orp_req,
841 (*desc_it)->GetSource().IsSetGenome() ? (*desc_it)->GetSource().GetGenome() : CBioSource::eGenome_unknown,
842 is_insd_patent, imp.IsWP(), errs);
843 for (auto it : errs) {
844 imp.PostObjErr(it.severity, it.err_type, it.err_msg, **desc_it, *ctx_it);
845 }
846 ++reply_it;
847 ++desc_it;
848 ++ctx_it;
849 }
850 // process feat responses
851 vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.cbegin();
852 while (reply_it != reply.GetReply().cend()
853 && feat_it != m_SrcFeats.end()) {
854 vector<TTaxError> errs;
855 const COrg_ref& orp_req = (*feat_it)->GetData().GetBiosrc().GetOrg();
856 ListTaxLookupErrors(**reply_it, orp_req,
857 (*feat_it)->GetData().GetBiosrc().IsSetGenome() ? (*feat_it)->GetData().GetBiosrc().GetGenome() : CBioSource::eGenome_unknown,
858 is_insd_patent, imp.IsWP(), errs);
859 for (auto it : errs) {
860 imp.PostErr(it.severity, it.err_type, it.err_msg,* *feat_it);
861 }
862 ++reply_it;
863 ++feat_it;
864 }
865
866 }
867
868
ReportIncrementalTaxLookupErrors(const CTaxon3_reply & reply,CValidError_imp & imp,bool is_insd_patent,size_t offset) const869 void CTaxValidationAndCleanup::ReportIncrementalTaxLookupErrors
870 (const CTaxon3_reply& reply,
871 CValidError_imp& imp,
872 bool is_insd_patent,
873 size_t offset) const
874 {
875 // cout << MSerial_AsnText << reply << endl;
876
877 CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
878
879 // process descriptor responses
880 vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.cbegin();
881 vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.cbegin();
882
883 size_t skipped = 0;
884 while (skipped < offset
885 && desc_it != m_SrcDescs.cend()
886 && ctx_it != m_DescCtxs.cend()) {
887 ++desc_it;
888 ++ctx_it;
889 skipped++;
890 }
891
892 while (reply_it != reply.GetReply().end()
893 && desc_it != m_SrcDescs.cend()
894 && ctx_it != m_DescCtxs.cend()) {
895 vector<TTaxError> errs;
896 const COrg_ref& orp_req = (*desc_it)->GetSource().GetOrg();
897 ListTaxLookupErrors(**reply_it, orp_req,
898 (*desc_it)->GetSource().IsSetGenome() ? (*desc_it)->GetSource().GetGenome() : CBioSource::eGenome_unknown,
899 is_insd_patent, imp.IsWP(), errs);
900 for (auto it : errs) {
901 imp.PostObjErr(it.severity, it.err_type, it.err_msg, **desc_it, *ctx_it);
902 }
903 ++reply_it;
904 ++desc_it;
905 ++ctx_it;
906 }
907
908 if (reply_it == reply.GetReply().end()) {
909 return;
910 }
911 // process feat responses
912 vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.cbegin();
913 while (skipped < offset && feat_it != m_SrcFeats.end()) {
914 ++feat_it;
915 skipped++;
916 }
917 while (reply_it != reply.GetReply().cend() &&
918 feat_it != m_SrcFeats.end()) {
919 vector<TTaxError> errs;
920 const COrg_ref& orp_req = (*feat_it)->GetData().GetBiosrc().GetOrg();
921 ListTaxLookupErrors(**reply_it, orp_req,
922 (*feat_it)->GetData().GetBiosrc().IsSetGenome() ? (*feat_it)->GetData().GetBiosrc().GetGenome() : CBioSource::eGenome_unknown,
923 is_insd_patent, imp.IsWP(), errs);
924 for (auto it : errs) {
925 imp.PostErr(it.severity, it.err_type, it.err_msg, **feat_it);
926 }
927 ++reply_it;
928 ++feat_it;
929 }
930
931
932 }
933
934
935
936 //LCOV_EXCL_START
937 //used by Genome Workbench
AdjustOrgRefsWithTaxLookupReply(const CTaxon3_reply & reply,vector<CRef<COrg_ref>> org_refs,string & error_message,bool use_error_orgrefs) const938 bool CTaxValidationAndCleanup::AdjustOrgRefsWithTaxLookupReply
939 ( const CTaxon3_reply& reply,
940 vector<CRef<COrg_ref> > org_refs,
941 string& error_message,
942 bool use_error_orgrefs) const
943 {
944 bool changed = false;
945 CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
946 vector<CRef<COrg_ref> >::iterator org_it = org_refs.begin();
947 while (reply_it != reply.GetReply().end() && org_it != org_refs.end()) {
948 CRef<COrg_ref> cpy(NULL);
949 if ((*reply_it)->IsData() &&
950 (*reply_it)->GetData().IsSetOrg()) {
951 cpy.Reset(new COrg_ref());
952 cpy->Assign((*reply_it)->GetData().GetOrg());
953 } else if (use_error_orgrefs &&
954 (*reply_it)->IsError() &&
955 (*reply_it)->GetError().IsSetOrg() &&
956 (*reply_it)->GetError().GetOrg().IsSetTaxname() &&
957 !NStr::Equal((*reply_it)->GetError().GetOrg().GetTaxname(), "Not valid")) {
958 cpy.Reset(new COrg_ref());
959 cpy->Assign((*reply_it)->GetError().GetOrg());
960 }
961 if (cpy) {
962 cpy->CleanForGenBank();
963 if (!cpy->Equals(**org_it)) {
964 (*org_it)->Assign(*cpy);
965 changed = true;
966 }
967 }
968 ++reply_it;
969 ++org_it;
970 }
971 if (reply_it != reply.GetReply().end()) {
972 error_message = "More taxonomy replies than requests!";
973 } else if (org_it != org_refs.end()) {
974 error_message = "Not enough taxonomy replies!";
975 }
976 return changed;
977 }
978 //LCOV_EXCL_STOP
979
980
GetSpecificHostLookupRequest(bool for_fix)981 vector<CRef<COrg_ref> > CTaxValidationAndCleanup::GetSpecificHostLookupRequest(bool for_fix)
982 {
983 if (for_fix) {
984 if (!m_HostMapForFix.IsPopulated()) {
985 x_CreateQualifierMap(m_HostMapForFix);
986 }
987 return m_HostMapForFix.GetRequestList();
988 } else {
989 if (!m_HostMap.IsPopulated()) {
990 x_CreateQualifierMap(m_HostMap);
991 }
992 return m_HostMap.GetRequestList();
993 }
994 }
995
GetStrainLookupRequest()996 vector<CRef<COrg_ref> > CTaxValidationAndCleanup::GetStrainLookupRequest()
997 {
998 if (!m_StrainRequestsBuilt) {
999 x_CreateStrainMap();
1000 }
1001
1002 vector<CRef<COrg_ref> > org_rq_list = m_StrainMap.GetRequestList();
1003 return org_rq_list;
1004 }
1005
1006
x_CreateQualifierMap(CQualLookupMap & lookup)1007 void CTaxValidationAndCleanup::x_CreateQualifierMap(CQualLookupMap& lookup)
1008 {
1009 //first do descriptors
1010 vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.begin();
1011 vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.begin();
1012 while (desc_it != m_SrcDescs.end() && ctx_it != m_DescCtxs.end()) {
1013 lookup.AddDesc(*desc_it, *ctx_it);
1014 ++desc_it;
1015 ++ctx_it;
1016 }
1017 // collect features with specific hosts
1018 vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.begin();
1019 while (feat_it != m_SrcFeats.end()) {
1020 lookup.AddFeat(*feat_it);
1021 ++feat_it;
1022 }
1023
1024 }
1025
1026
x_CreateStrainMap()1027 void CTaxValidationAndCleanup::x_CreateStrainMap()
1028 {
1029 x_CreateQualifierMap(m_StrainMap);
1030 m_StrainRequestsBuilt = true;
1031 }
1032
1033
ReportSpecificHostErrors(CValidError_imp & imp)1034 void CTaxValidationAndCleanup::ReportSpecificHostErrors(CValidError_imp& imp)
1035 {
1036 m_HostMap.PostErrors(imp);
1037 }
1038
1039 //LCOV_EXCL_START
1040 //appears to not be used
ReportSpecificHostErrors(const CTaxon3_reply & reply,CValidError_imp & imp)1041 void CTaxValidationAndCleanup::ReportSpecificHostErrors(const CTaxon3_reply& reply, CValidError_imp& imp)
1042 {
1043 string error_message;
1044 if (!m_HostMap.IsUpdateComplete()) {
1045 vector<CRef<COrg_ref> > input = m_HostMap.GetRequestList();
1046 error_message = m_HostMap.IncrementalUpdate(input, reply);
1047 }
1048 if (!NStr::IsBlank(error_message)) {
1049 imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, error_message, *(GetTopReportObject()));
1050 return;
1051 }
1052
1053 m_HostMap.PostErrors(imp);
1054 }
1055 //LCOV_EXCL_STOP
1056
1057
1058 //LCOV_EXCL_START
1059 //only used by cleanup
AdjustOrgRefsWithSpecificHostReply(vector<CRef<COrg_ref>> requests,const CTaxon3_reply & reply,vector<CRef<COrg_ref>> org_refs,string & error_message)1060 bool CTaxValidationAndCleanup::AdjustOrgRefsWithSpecificHostReply
1061 (vector<CRef<COrg_ref> > requests,
1062 const CTaxon3_reply& reply,
1063 vector<CRef<COrg_ref> > org_refs,
1064 string& error_message)
1065 {
1066 if (!m_HostMapForFix.IsUpdateComplete()) {
1067 // need to calculate requests for this list
1068 m_HostMapForFix.IncrementalUpdate(requests, reply);
1069 }
1070 return AdjustOrgRefsForSpecificHosts(org_refs);
1071 }
1072
1073
AdjustOrgRefsForSpecificHosts(vector<CRef<COrg_ref>> org_refs)1074 bool CTaxValidationAndCleanup::AdjustOrgRefsForSpecificHosts(vector<CRef<COrg_ref> > org_refs)
1075 {
1076 bool changed = false;
1077 for (auto org = org_refs.begin(); org != org_refs.end(); org++) {
1078 changed |= m_HostMapForFix.ApplyToOrg(**org);
1079 }
1080 return changed;
1081 }
1082
1083
x_FindHostFixRequest(const string & val)1084 TSpecificHostRequests::iterator CTaxValidationAndCleanup::x_FindHostFixRequest(const string& val)
1085 {
1086 TSpecificHostRequests::iterator map_it = m_SpecificHostRequests.find(val);
1087 if (map_it != m_SpecificHostRequests.end() && map_it->second.NumRemainingReplies() > 0) {
1088 return map_it;
1089 }
1090 map_it = m_SpecificHostRequests.begin();
1091 while (map_it != m_SpecificHostRequests.end()) {
1092 if (map_it->second.MatchTryValue(val) && map_it->second.NumRemainingReplies() > 0) {
1093 return map_it;
1094 }
1095 ++map_it;
1096 }
1097 return m_SpecificHostRequests.end();
1098 }
1099 //LCOV_EXCL_STOP
1100
1101
IncrementalSpecificHostMapUpdate(const vector<CRef<COrg_ref>> & input,const CTaxon3_reply & reply)1102 string CTaxValidationAndCleanup::IncrementalSpecificHostMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply)
1103 {
1104 string error_message;
1105 if (m_HostMap.IsPopulated()) {
1106 error_message = m_HostMap.IncrementalUpdate(input, reply);
1107 }
1108 if (NStr::IsBlank(error_message)) {
1109 if (m_HostMapForFix.IsPopulated()) {
1110 error_message = m_HostMapForFix.IncrementalUpdate(input, reply);
1111 }
1112 }
1113 return error_message;
1114 }
1115
1116
1117 //LCOV_EXCL_START
1118 //used only by cleanup
IsSpecificHostMapUpdateComplete() const1119 bool CTaxValidationAndCleanup::IsSpecificHostMapUpdateComplete() const
1120 {
1121 if (m_HostMap.IsPopulated()) {
1122 return m_HostMap.IsUpdateComplete();
1123 } else if (m_HostMapForFix.IsPopulated()) {
1124 return m_HostMapForFix.IsUpdateComplete();
1125 } else {
1126 return false;
1127 }
1128 }
1129
1130
x_UpdateSpecificHostMapWithReply(const CTaxon3_reply & reply,string & error_message)1131 void CTaxValidationAndCleanup::x_UpdateSpecificHostMapWithReply(const CTaxon3_reply& reply, string& error_message)
1132 {
1133 CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
1134 TSpecificHostRequests::iterator rq_it = m_SpecificHostRequests.begin();
1135 while (rq_it != m_SpecificHostRequests.end()) {
1136 while (rq_it->second.NumRemainingReplies() > 0 && reply_it != reply.GetReply().end()) {
1137 rq_it->second.AddReply(**reply_it);
1138 ++reply_it;
1139 }
1140 if (rq_it->second.NumRemainingReplies() > 0) {
1141 error_message = "Failed to respond to all taxonomy requests for specific host";
1142 break;
1143 }
1144 ++rq_it;
1145 }
1146
1147 if (reply_it != reply.GetReply().end()) {
1148 error_message = "Unexpected taxonomy responses for specific host";
1149 }
1150 }
1151
1152
x_ApplySpecificHostMap(COrg_ref & org_ref) const1153 bool CTaxValidationAndCleanup::x_ApplySpecificHostMap(COrg_ref& org_ref) const
1154 {
1155 if (!org_ref.IsSetOrgname() ||
1156 !org_ref.GetOrgname().IsSetMod()) {
1157 return false;
1158 }
1159
1160 bool changed = false;
1161
1162 for (auto m = org_ref.SetOrgname().SetMod().begin(); m != org_ref.SetOrgname().SetMod().end(); m++) {
1163 if ((*m)->IsSetSubtype() &&
1164 (*m)->GetSubtype() == COrgMod::eSubtype_nat_host &&
1165 (*m)->IsSetSubname()) {
1166 string host_val = x_DefaultSpecificHostAdjustments((*m)->GetSubname());
1167 TSpecificHostRequests::const_iterator it = m_SpecificHostRequests.find(host_val);
1168 if (it != m_SpecificHostRequests.end()) {
1169 const string& new_val = it->second.SuggestFix();
1170 if (!NStr::IsBlank(new_val) && !NStr::Equal(new_val, (*m)->GetSubname())) {
1171 (*m)->SetSubname(new_val);
1172 changed = true;
1173 }
1174 }
1175 }
1176 }
1177
1178 return changed;
1179 }
1180
1181
// Normalizes a specific-host value: surrounding spaces are removed with
// NStr::TruncateSpacesInPlace() and the result is passed through
// COrgMod::FixHost().
//
//   host_val  raw host value; not modified
//
// Returns the adjusted value.
string CTaxValidationAndCleanup::x_DefaultSpecificHostAdjustments(const string& host_val)
{
    string trimmed(host_val);
    NStr::TruncateSpacesInPlace(trimmed);

    string fixed = COrgMod::FixHost(trimmed);
    return fixed;
}
1189
1190
IncrementalStrainMapUpdate(const vector<CRef<COrg_ref>> & input,const CTaxon3_reply & reply)1191 string CTaxValidationAndCleanup::IncrementalStrainMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply)
1192 {
1193 return m_StrainMap.IncrementalUpdate(input, reply);
1194 }
1195
1196
IsStrainMapUpdateComplete() const1197 bool CTaxValidationAndCleanup::IsStrainMapUpdateComplete() const
1198 {
1199 return m_StrainMap.IsUpdateComplete();
1200 }
1201 //LCOV_EXCL_STOP
1202
1203
ReportStrainErrors(CValidError_imp & imp)1204 void CTaxValidationAndCleanup::ReportStrainErrors(CValidError_imp& imp)
1205 {
1206 m_StrainMap.PostErrors(imp);
1207 }
1208
GetSeqContext(size_t num) const1209 CConstRef<CSeq_entry> CTaxValidationAndCleanup::GetSeqContext(size_t num) const
1210 {
1211 return (num < m_DescCtxs.size()) ? m_DescCtxs[num] : CConstRef<CSeq_entry>();
1212 }
1213
1214
1215 //LCOV_EXCL_START
1216 //used by Genome Workbench, asn_cleanup, and table2asn but not asnvalidate
DoTaxonomyUpdate(CSeq_entry_Handle seh,bool with_host)1217 bool CTaxValidationAndCleanup::DoTaxonomyUpdate(CSeq_entry_Handle seh, bool with_host)
1218 {
1219 Init(*(seh.GetCompleteSeq_entry()));
1220
1221 vector<CRef<COrg_ref> > original_orgs = GetTaxonomyLookupRequest();
1222 if (original_orgs.empty())
1223 {
1224 return false;
1225 }
1226 const size_t chunk_size = 1000;
1227 vector< CRef<COrg_ref> > edited_orgs;
1228
1229 CTaxon3 taxon3;
1230 taxon3.Init();
1231 size_t i = 0;
1232 while (i < original_orgs.size())
1233 {
1234 size_t len = min(chunk_size, original_orgs.size() - i);
1235 vector< CRef<COrg_ref> > tmp_original_orgs(original_orgs.begin() + i, original_orgs.begin() + i + len);
1236 vector< CRef<COrg_ref> > tmp_edited_orgs;
1237 ITERATE(vector<CRef<COrg_ref> >, it, tmp_original_orgs)
1238 {
1239 CRef<COrg_ref> cpy(new COrg_ref());
1240 cpy->Assign(**it);
1241 tmp_edited_orgs.push_back(cpy);
1242 }
1243 CRef<CTaxon3_reply> tmp_lookup_reply = taxon3.SendOrgRefList(tmp_original_orgs);
1244 string error_message;
1245 AdjustOrgRefsWithTaxLookupReply(*tmp_lookup_reply, tmp_edited_orgs, error_message);
1246 if (!NStr::IsBlank(error_message))
1247 {
1248 // post error message
1249 ERR_POST(Error << error_message);
1250 return false;
1251 }
1252 edited_orgs.insert(edited_orgs.end(), tmp_edited_orgs.begin(), tmp_edited_orgs.end());
1253 i += len;
1254 }
1255
1256 if (with_host) {
1257 vector< CRef<COrg_ref> > spec_host_rq = GetSpecificHostLookupRequest(true);
1258 i = 0;
1259 while (i < spec_host_rq.size())
1260 {
1261 size_t len = min(chunk_size, spec_host_rq.size() - i);
1262 vector< CRef<COrg_ref> > tmp_spec_host_rq(spec_host_rq.begin() + i, spec_host_rq.begin() + i + len);
1263 CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(tmp_spec_host_rq);
1264 string error_message = IncrementalSpecificHostMapUpdate(tmp_spec_host_rq, *tmp_spec_host_reply);
1265 if (!NStr::IsBlank(error_message))
1266 {
1267 // post error message
1268 ERR_POST(Error << error_message);
1269 return false;
1270 }
1271 i += len;
1272 }
1273
1274 AdjustOrgRefsForSpecificHosts(edited_orgs);
1275 }
1276
1277 // update descriptors
1278 size_t num_descs = NumDescs();
1279 size_t num_updated_descs = 0;
1280 for (size_t n = 0; n < num_descs; n++) {
1281 if (!original_orgs[n]->Equals(*(edited_orgs[n]))) {
1282 CSeqdesc* orig = const_cast<CSeqdesc *>(GetDesc(n).GetPointer());
1283 orig->SetSource().SetOrg().Assign(*(edited_orgs[n]));
1284 num_updated_descs++;
1285 }
1286 }
1287
1288 // now update features
1289 size_t num_updated_feats = 0;
1290 for (size_t n = 0; n < NumFeats(); n++) {
1291 if (!original_orgs[n + num_descs]->Equals(*edited_orgs[n + num_descs])) {
1292 CConstRef<CSeq_feat> feat = GetFeat(n);
1293 CRef<CSeq_feat> new_feat(new CSeq_feat());
1294 new_feat->Assign(*feat);
1295 new_feat->SetData().SetBiosrc().SetOrg().Assign(*(edited_orgs[n + num_descs]));
1296
1297 CSeq_feat_Handle fh = seh.GetScope().GetSeq_featHandle(*feat);
1298 CSeq_feat_EditHandle efh(fh);
1299 efh.Replace(*new_feat);
1300 num_updated_feats++;
1301 }
1302 }
1303 return (num_updated_descs > 0 || num_updated_feats > 0);
1304 }
1305 //LCOV_EXCL_STOP
1306
1307
1308 //LCOV_EXCL_START
1309 //only used by biosample
FixOneSpecificHost(string & val)1310 void CTaxValidationAndCleanup::FixOneSpecificHost(string& val)
1311 {
1312 val = x_DefaultSpecificHostAdjustments(val);
1313 string err_msg;
1314 if(IsOneSpecificHostValid(val, err_msg)) {
1315 return;
1316 }
1317 m_HostMapForFix.Clear();
1318 m_HostMapForFix.AddString(val);
1319
1320 vector< CRef<COrg_ref> > spec_host_rq = m_HostMapForFix.GetRequestList();
1321 if (spec_host_rq.empty()) {
1322 m_HostMapForFix.Clear();
1323 return;
1324 }
1325 vector< CRef<COrg_ref> > edited;
1326 edited.push_back(CRef<COrg_ref>(new COrg_ref()));
1327 edited.front()->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_nat_host, val)));
1328
1329 CTaxon3 taxon3;
1330 taxon3.Init();
1331 CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(spec_host_rq);
1332
1333 if (!tmp_spec_host_reply->IsSetReply() || !tmp_spec_host_reply->GetReply().front()->IsData()) {
1334 val = kEmptyStr;
1335 m_HostMapForFix.Clear();
1336 return;
1337 }
1338
1339 string error_message = IncrementalSpecificHostMapUpdate(spec_host_rq, *tmp_spec_host_reply);
1340 if (!NStr::IsBlank(error_message))
1341 {
1342 // post error message
1343 ERR_POST(Error << error_message);
1344 }
1345
1346
1347 AdjustOrgRefsForSpecificHosts(edited);
1348
1349 val = edited.front()->GetOrgname().GetMod().front()->GetSubname();
1350 m_HostMapForFix.Clear();
1351 }
1352 //LCOV_EXCL_STOP
1353
1354
1355 //LCOV_EXCL_START
1356 //only used by biosample
IsOneSpecificHostValid(const string & val,string & error_msg)1357 bool CTaxValidationAndCleanup::IsOneSpecificHostValid(const string& val, string& error_msg)
1358 {
1359 error_msg = kEmptyStr;
1360 m_HostMap.Clear();
1361
1362 m_HostMap.AddString(val);
1363
1364 vector< CRef<COrg_ref> > spec_host_rq = m_HostMap.GetRequestList();
1365 if (spec_host_rq.empty()) {
1366 m_HostMap.Clear();
1367 return true;
1368 }
1369
1370 CTaxon3 taxon3;
1371 taxon3.Init();
1372 CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(spec_host_rq);
1373
1374 string err_msg;
1375 if (tmp_spec_host_reply) {
1376 err_msg = IncrementalSpecificHostMapUpdate(spec_host_rq, *tmp_spec_host_reply);
1377 } else {
1378 err_msg = "Connection to taxonomy failed";
1379 }
1380 bool rval = true;
1381 error_msg = err_msg;
1382
1383 if (!NStr::IsBlank(err_msg)) {
1384 ERR_POST(Error << err_msg);
1385 m_HostMap.Clear();
1386 rval = false;
1387 } else {
1388 vector<TTaxError> errs;
1389 m_HostMap.ListErrors(errs);
1390 if (errs.size() > 0) {
1391 error_msg = errs.front().err_msg;
1392 rval = false;
1393 }
1394 }
1395 m_HostMap.Clear();
1396 return rval;
1397 }
1398 //LCOV_EXCL_STOP
1399
1400
CheckOneOrg(const COrg_ref & org,int genome,CValidError_imp & imp)1401 void CTaxValidationAndCleanup::CheckOneOrg(const COrg_ref& org, int genome, CValidError_imp& imp)
1402 {
1403 x_ClearMaps();
1404
1405 vector<TTaxError> errs;
1406 CTaxon3 taxon3;
1407 taxon3.Init();
1408
1409 // lookup of whole org
1410 vector< CRef<COrg_ref> > org_rq_list;
1411 CRef<COrg_ref> rq(new COrg_ref);
1412 rq->Assign(org);
1413 org_rq_list.push_back(rq);
1414
1415 CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(org_rq_list);
1416
1417 if (!reply || !reply->IsSetReply()) {
1418 imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyServiceProblem,
1419 "Taxonomy service connection failure", org);
1420 } else {
1421 ListTaxLookupErrors(*(reply->GetReply().front()), org, genome,
1422 false, false, errs);
1423 }
1424
1425 // Now look at specific-host values
1426 m_HostMap.AddOrg(org);
1427 org_rq_list = GetSpecificHostLookupRequest(false);
1428
1429 if (!org_rq_list.empty()) {
1430 reply = taxon3.SendOrgRefList(org_rq_list);
1431 string err_msg;
1432 if (reply) {
1433 err_msg = IncrementalSpecificHostMapUpdate(org_rq_list, *reply);
1434 } else {
1435 err_msg = "Connection to taxonomy failed";
1436 }
1437 if (!NStr::IsBlank(err_msg)) {
1438 imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, err_msg, org);
1439 } else {
1440 m_HostMap.ListErrors(errs);
1441 }
1442 }
1443
1444
1445 // validate strain
1446 m_StrainMap.AddOrg(org);
1447 org_rq_list = GetStrainLookupRequest();
1448 if (!org_rq_list.empty()) {
1449 reply = taxon3.SendOrgRefList(org_rq_list);
1450 string err_msg = IncrementalStrainMapUpdate(org_rq_list, *reply);
1451 if (!NStr::IsBlank(err_msg)) {
1452 imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, err_msg, org);
1453 } else {
1454 m_StrainMap.ListErrors(errs);
1455 }
1456 }
1457
1458 for (auto it : errs) {
1459 imp.PostObjErr(it.severity, it.err_type, it.err_msg, org);
1460 }
1461 }
1462
1463
1464 END_SCOPE(validator)
1465 END_SCOPE(objects)
1466 END_NCBI_SCOPE
1467