1 /*  $Id: tax_validation_and_cleanup.cpp 636458 2021-08-24 17:53:54Z fukanchi $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   Tools for batch processing taxonomy-related validation and cleanup
30  *   .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <objmgr/object_manager.hpp>
37 
38 #include <serial/iterator.hpp>
39 
40 #include <objects/seqfeat/BioSource.hpp>
41 #include <objects/seqfeat/OrgMod.hpp>
42 #include <objects/seqfeat/OrgName.hpp>
43 #include <objects/seqfeat/Org_ref.hpp>
44 #include <objects/seqfeat/Seq_feat.hpp>
45 #include <objects/seqfeat/SubSource.hpp>
46 
47 #include <objmgr/bioseq_ci.hpp>
48 #include <objmgr/seqdesc_ci.hpp>
49 #include <objmgr/util/feature.hpp>
50 
51 #include <objmgr/feat_ci.hpp>
52 #include <objmgr/scope.hpp>
53 
54 #include <objects/taxon3/taxon3.hpp>
55 #include <objects/taxon3/Taxon3_reply.hpp>
56 
57 #include <objtools/validator/validator.hpp>
58 #include <objtools/validator/validerror_imp.hpp>
59 #include <objtools/validator/tax_validation_and_cleanup.hpp>
60 #include <objtools/validator/utilities.hpp>
61 
62 #define NCBI_USE_ERRCODE_X   Objtools_Validator
63 
64 BEGIN_NCBI_SCOPE
65 BEGIN_SCOPE(objects)
66 BEGIN_SCOPE(validator)
67 using namespace sequence;
68 
69 const string kInvalidReplyMsg = "Taxonomy service returned invalid reply";
70 
71 
CQualifierRequest()72 CQualifierRequest::CQualifierRequest()
73 {
74     x_Init();
75 }
76 
77 
x_Init()78 void CQualifierRequest::x_Init()
79 {
80     m_ValuesToTry.clear();
81     m_RepliesProcessed = 0;
82     m_Descs.clear();
83     m_Feats.clear();
84 }
85 
86 
AddParent(CConstRef<CSeqdesc> desc,CConstRef<CSeq_entry> ctx)87 void CQualifierRequest::AddParent(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx)
88 {
89     m_Descs.push_back(TDescPair(desc, ctx));
90 }
91 
92 
AddParent(CConstRef<CSeq_feat> feat)93 void CQualifierRequest::AddParent(CConstRef<CSeq_feat> feat)
94 {
95     m_Feats.push_back(feat);
96 }
97 
98 
AddRequests(vector<CRef<COrg_ref>> & request_list) const99 void CQualifierRequest::AddRequests(vector<CRef<COrg_ref> >& request_list) const
100 {
101     for (auto it = m_ValuesToTry.begin(); it != m_ValuesToTry.end(); it++) {
102         CRef<COrg_ref> rq(new COrg_ref);
103         rq->SetTaxname(*it);
104         request_list.push_back(rq);
105     }
106 }
107 
108 
MatchTryValue(const string & val) const109 bool CQualifierRequest::MatchTryValue(const string& val) const
110 {
111     for (auto it = m_ValuesToTry.begin(); it != m_ValuesToTry.end(); it++) {
112         if (NStr::EqualNocase(val, *it)) {
113             return true;
114         }
115     }
116     return false;
117 }
118 
119 
PostErrors(CValidError_imp & imp)120 void CQualifierRequest::PostErrors(CValidError_imp& imp)
121 {
122     vector<TTaxError> errs;
123     ListErrors(errs);
124     for (auto e : errs) {
125         for (auto it = m_Descs.begin(); it != m_Descs.end(); it++) {
126             imp.PostObjErr(e.severity, e.err_type, e.err_msg, *(it->first), it->second);
127         }
128         for (auto it = m_Feats.begin(); it != m_Feats.end(); it++) {
129             imp.PostObjErr(e.severity, e.err_type, e.err_msg, **it);
130         }
131     }
132 }
133 
134 
CSpecificHostRequest(const string & host,const COrg_ref & org,bool for_fix)135 CSpecificHostRequest::CSpecificHostRequest(const string& host, const COrg_ref& org, bool for_fix) :
136     CQualifierRequest(),
137     m_Host(host),
138     m_Response(eUnrecognized),
139     m_HostLineage(kEmptyStr),
140     m_OrgLineage(kEmptyStr)
141 {
142     string host_check = SpecificHostValueToCheck(host);
143     if (NStr::IsBlank(host_check)) {
144         m_Response = eNormal;
145         return;
146     }
147     if (!for_fix && !NStr::Equal(host, host_check)) {
148         m_ValuesToTry.push_back(host_check);
149     }
150     m_ValuesToTry.push_back(host);
151 
152     m_SuggestedFix.clear();
153     if (org.IsSetLineage()) {
154         m_OrgLineage = org.GetLineage();
155     }
156 }
157 
158 
AddReply(const CT3Reply & reply)159 void CSpecificHostRequest::AddReply(const CT3Reply& reply)
160 {
161     if (m_Response == eAmbiguous) {
162         string new_error = InterpretSpecificHostResult(m_ValuesToTry[m_RepliesProcessed], reply, m_Host);
163         if (NStr::IsBlank(new_error)) {
164             m_Response = eNormal;
165             m_SuggestedFix = m_Host;
166             m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
167             m_Error = kEmptyStr;
168         }
169     } else if (m_Response == eUnrecognized) {
170         m_Error = InterpretSpecificHostResult(m_ValuesToTry[m_RepliesProcessed], reply, m_Host);
171         if (NStr::IsBlank(m_Error)) {
172             m_Response = eNormal;
173             m_SuggestedFix = m_Host;
174             m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
175         } else if (NStr::Find(m_Error, "ambiguous") != NPOS) {
176             m_Response = eAmbiguous;
177         } else if (NStr::StartsWith(m_Error, "Invalid value for specific host") && !IsLikelyTaxname(m_Host)) {
178             m_Response = eNormal;
179             m_SuggestedFix = m_Host;
180         } else if (NStr::StartsWith(m_Error, "Specific host value is alternate name")) {
181             m_Response = eAlternateName;
182             m_SuggestedFix = reply.GetData().GetOrg().GetTaxname();
183             m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
184         } else {
185             m_Response = eUnrecognized;
186             if (NStr::IsBlank(m_SuggestedFix) && reply.IsData() && reply.GetData().IsSetOrg()) {
187                 if (HasMisSpellFlag(reply.GetData())) {
188                     m_SuggestedFix = reply.GetData().GetOrg().GetTaxname();
189                     m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
190                 } else if (!FindMatchInOrgRef(m_Host, reply.GetData().GetOrg())
191                         && !IsCommonName(reply.GetData())) {
192                     m_SuggestedFix = reply.GetData().GetOrg().GetTaxname();
193                     m_HostLineage = reply.GetData().GetOrg().IsSetLineage() ? reply.GetData().GetOrg().GetLineage() : kEmptyStr;
194                 }
195             }
196         }
197     }
198     m_RepliesProcessed++;
199 }
200 
201 
ListErrors(vector<TTaxError> & errs) const202 void CSpecificHostRequest::ListErrors(vector<TTaxError>& errs) const
203 {
204     switch (m_Response) {
205         case eNormal:
206             break;
207         case eAmbiguous:
208             errs.push_back(TTaxError{ eDiag_Info, eErr_SEQ_DESCR_AmbiguousSpecificHost, m_Error });
209             break;
210         case eUnrecognized:
211             errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_BadSpecificHost, m_Error });
212             break;
213         case eAlternateName:
214             errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_BadSpecificHost, m_Error });
215             break;
216     }
217 
218     if (!NStr::IsBlank(m_HostLineage) && !NStr::IsBlank(m_OrgLineage) &&
219         (NStr::Find(m_OrgLineage, "Streptophyta") != NPOS || NStr::Find(m_OrgLineage, "Metazoa") != NPOS) &&
220         (NStr::Find(m_HostLineage, "Fungi;") != NPOS || NStr::Find(m_HostLineage, "Bacteria") != NPOS ||
221         NStr::Find(m_HostLineage, "Archaea") != NPOS || NStr::Find(m_HostLineage, "Viruses") != NPOS)) {
222         errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_BadSpecificHost,
223             "Suspect Host Value - a prokaryote, fungus or virus is suspect as a host for a plant or animal" });
224     }
225 }
226 
227 
228 //LCOV_EXCL_START
229 //used by cleanup
SuggestFix() const230 const string& CSpecificHostRequest::SuggestFix() const
231 {
232     if (m_ValuesToTry.empty()) {
233         return m_Host;
234     } else {
235         return m_SuggestedFix;
236     }
237 }
238 //LCOV_EXCL_STOP
239 
240 
x_IgnoreStrain(const string & str)241 bool CStrainRequest::x_IgnoreStrain(const string& str)
242 {
243     // per VR-762, ignore strain if combination of letters and numbers
244     bool has_number = false;
245     bool has_letter = false;
246     for (size_t i = 0; i < str.length(); i++) {
247         char ch = str.c_str()[i];
248         if (isdigit(ch)) {
249             has_number = true;
250         } else if (isalpha(ch)) {
251             has_letter = true;
252         } else {
253             return false;
254         }
255     }
256     if (!has_number || !has_letter) {
257         return false;
258     } else {
259         return true;
260     }
261 }
262 
263 
CStrainRequest(const string & strain,const COrg_ref & org)264 CStrainRequest::CStrainRequest(const string& strain, const COrg_ref& org) : CQualifierRequest(), m_Strain(strain)
265 {
266     if (org.IsSetTaxname()) {
267         m_Taxname = org.GetTaxname();
268     } else {
269         m_Taxname.clear();
270     }
271 
272     m_IsInvalid = false;
273     if (NStr::IsBlank(strain) || x_IgnoreStrain(strain)) {
274         return;
275     }
276 
277     m_ValuesToTry.push_back(strain);
278     size_t pos = 0;
279     while (strain[pos] != 0 && isalpha(strain[pos])) {
280         ++pos;
281     }
282     if (pos < strain.length() && pos >= 5) {
283         m_ValuesToTry.push_back(strain.substr(0, pos));
284     }
285 
286     if (RequireTaxname(m_Taxname)) {
287         m_ValuesToTry.push_back(MakeKey(strain, m_Taxname));
288     }
289 }
290 
291 
// Build the key under which a strain is stored and looked up.
// When RequireTaxname() holds for taxname, the key is the taxname with its
// last three characters removed, followed by the strain; otherwise the key
// is the strain alone.
string CStrainRequest::MakeKey(const string& strain, const string& taxname)
{
    if (!RequireTaxname(taxname)) {
        return strain;
    }
    string key(taxname, 0, taxname.length() - 3);
    key += strain;
    return key;
}
300 
301 
RequireTaxname(const string & taxname)302 bool CStrainRequest::RequireTaxname(const string& taxname)
303 {
304     if (NStr::EndsWith(taxname, " sp.")) {
305         return true;
306     } else {
307         return false;
308     }
309 }
310 
311 
x_IsUnwanted(const string & str)312 bool CStrainRequest::x_IsUnwanted(const string& str)
313 {
314     if (NStr::FindNoCase(str, "virus") != NPOS ||
315         NStr::FindNoCase(str, "viroid") != NPOS ||
316         NStr::FindNoCase(str, "vector") != NPOS ||
317         NStr::FindNoCase(str, "phage") != NPOS) {
318         return true;
319     } else {
320         return false;
321     }
322 }
323 
324 
Check(const COrg_ref & org)325 bool CStrainRequest::Check(const COrg_ref& org)
326 {
327     if (org.IsSetLineage() && x_IsUnwanted(org.GetLineage())) {
328         return false;
329     }
330     if (org.IsSetTaxname() && x_IsUnwanted(org.GetTaxname())) {
331         return false;
332     }
333     if (!org.IsSetOrgMod()) {
334         return false;
335     }
336     for (auto it : org.GetOrgname().GetMod()) {
337         if (it->IsSetSubtype() && it->IsSetSubname() &&
338             it->GetSubtype() == COrgMod::eSubtype_strain) {
339             return true;
340         }
341     }
342     return false;
343 }
344 
345 
ListErrors(vector<TTaxError> & errs) const346 void CStrainRequest::ListErrors(vector<TTaxError>& errs) const
347 {
348     if (m_IsInvalid) {
349         errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_StrainContainsTaxInfo,
350             "Strain '" + m_Strain + "' contains taxonomic name information" });
351     }
352 }
353 
354 
AddReply(const CT3Reply & reply)355 void CStrainRequest::AddReply(const CT3Reply& reply)
356 {
357     if (!m_IsInvalid) {
358         if (reply.IsData() && reply.GetData().IsSetOrg()) {
359             // TODO: if using just a one word input, make sure name is actually in taxname
360             if (m_ValuesToTry[m_RepliesProcessed].length() < m_Strain.length()) {
361                 if (NStr::EqualNocase(m_ValuesToTry[m_RepliesProcessed], reply.GetData().GetOrg().GetTaxname())) {
362                     m_IsInvalid = true;
363                 }
364             } else {
365                 m_IsInvalid = true;
366             }
367         }
368     }
369     m_RepliesProcessed++;
370 }
371 
372 
Clear()373 void CQualLookupMap::Clear()
374 {
375     m_Populated = false;
376     m_Map.clear();
377 }
378 
379 
AddDesc(CConstRef<CSeqdesc> desc,CConstRef<CSeq_entry> ctx)380 void CQualLookupMap::AddDesc(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx)
381 {
382     m_Populated = true;
383     if (!desc->IsSource() || !desc->GetSource().IsSetOrg()) {
384         return;
385     }
386     const COrg_ref& org = desc->GetSource().GetOrg();
387     if (!org.IsSetOrgMod()) {
388         return;
389     }
390     if (!Check(org)) {
391         return;
392     }
393     for (auto mod_it = org.GetOrgname().GetMod().begin(); mod_it != org.GetOrgname().GetMod().end(); mod_it++) {
394         if ((*mod_it)->IsSetSubtype()
395             && (*mod_it)->GetSubtype() == m_Subtype
396             && (*mod_it)->IsSetSubname()) {
397             string qual = (*mod_it)->GetSubname();
398             string key = GetKey(qual, org);
399             TQualifierRequests::iterator find = m_Map.find(key);
400             if (find == m_Map.end()) {
401                 m_Map[key] = x_MakeNewRequest(qual, org);
402                 m_Map[key]->AddParent(desc, ctx);
403             } else {
404                 find->second->AddParent(desc, ctx);
405             }
406         }
407     }
408 }
409 
410 
AddFeat(CConstRef<CSeq_feat> feat)411 void CQualLookupMap::AddFeat(CConstRef<CSeq_feat> feat)
412 {
413     m_Populated = true;
414     if (!feat->IsSetData() || !feat->GetData().IsBiosrc() ||
415         !feat->GetData().GetBiosrc().IsSetOrg()) {
416         return;
417     }
418     const COrg_ref& org = feat->GetData().GetBiosrc().GetOrg();
419     if (!org.IsSetOrgMod()) {
420         return;
421     }
422     if (!Check(org)) {
423         return;
424     }
425     for (auto mod_it = org.GetOrgname().GetMod().begin(); mod_it != org.GetOrgname().GetMod().end(); mod_it++) {
426         if ((*mod_it)->IsSetSubtype()
427             && (*mod_it)->GetSubtype() == m_Subtype
428             && (*mod_it)->IsSetSubname()) {
429             string qual = (*mod_it)->GetSubname();
430             string key = GetKey(qual, feat->GetData().GetBiosrc().GetOrg());
431             TQualifierRequests::iterator find = m_Map.find(key);
432             if (find == m_Map.end()) {
433                 m_Map[key] = x_MakeNewRequest(qual, feat->GetData().GetBiosrc().GetOrg());
434                 m_Map[key]->AddParent(feat);
435             } else {
436                 find->second->AddParent(feat);
437             }
438         }
439     }
440 }
441 
442 
AddOrg(const COrg_ref & org)443 void CQualLookupMap::AddOrg(const COrg_ref& org)
444 {
445     m_Populated = true;
446     if (!org.IsSetOrgMod()) {
447         return;
448     }
449     if (!Check(org)) {
450         return;
451     }
452     for (auto mod_it = org.GetOrgname().GetMod().begin(); mod_it != org.GetOrgname().GetMod().end(); mod_it++) {
453         if ((*mod_it)->IsSetSubtype()
454             && (*mod_it)->GetSubtype() == m_Subtype
455             && (*mod_it)->IsSetSubname()) {
456             string qual = (*mod_it)->GetSubname();
457             string key = GetKey(qual, org);
458             TQualifierRequests::iterator find = m_Map.find(key);
459             if (find == m_Map.end()) {
460                 m_Map[key] = x_MakeNewRequest(qual, org);
461             }
462         }
463     }
464 }
465 
466 
467 //LCOV_EXCL_START
468 //only used by biosample
AddString(const string & val)469 void CQualLookupMap::AddString(const string& val)
470 {
471     m_Populated = true;
472     TQualifierRequests::iterator find = m_Map.find(val);
473     if (find == m_Map.end()) {
474         CRef<COrg_ref> org(new COrg_ref());
475         m_Map[val] = x_MakeNewRequest(val, *org);
476     }
477 }
478 //LCOV_EXCL_STOP
479 
480 
GetRequestList()481 vector<CRef<COrg_ref> > CQualLookupMap::GetRequestList()
482 {
483     vector<CRef<COrg_ref> > org_rq_list;
484     org_rq_list.reserve(m_Map.size());
485     for (auto it = m_Map.begin(); it != m_Map.end(); it++) {
486         it->second->AddRequests(org_rq_list);
487     }
488     return org_rq_list;
489 }
490 
491 
x_FindRequest(const string & val)492 CQualLookupMap::TQualifierRequests::iterator CQualLookupMap::x_FindRequest(const string& val)
493 {
494     TQualifierRequests::iterator map_it = m_Map.find(val);
495     if (map_it != m_Map.end() && map_it->second->NumRemainingReplies() > 0) {
496         return map_it;
497     }
498     map_it = m_Map.begin();
499     while (map_it != m_Map.end()) {
500         if (map_it->second->MatchTryValue(val) && map_it->second->NumRemainingReplies() > 0) {
501             return map_it;
502         }
503         ++map_it;
504     }
505     return m_Map.end();
506 }
507 
508 
IncrementalUpdate(const vector<CRef<COrg_ref>> & input,const CTaxon3_reply & reply)509 string CQualLookupMap::IncrementalUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply)
510 {
511     string error_message;
512     CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
513     vector<CRef<COrg_ref> >::const_iterator rq_it = input.begin();
514 
515     while (reply_it != reply.GetReply().end() && rq_it != input.end()) {
516         TQualifierRequests::iterator map_it = x_FindRequest((*rq_it)->GetTaxname());
517         if (map_it == m_Map.end()) {
518             error_message = "Unexpected taxonomy response for " + (*rq_it)->GetTaxname();
519             return error_message;
520         }
521         map_it->second->AddReply(**reply_it);
522         ++rq_it;
523         ++reply_it;
524     }
525 
526     if (reply_it != reply.GetReply().end()) {
527         error_message = "Unexpected taxonomy responses for " + COrgMod::GetSubtypeName(m_Subtype);
528     }
529     return kEmptyStr;
530 }
531 
532 
533 //LCOV_EXCL_START
534 //only used for cleanup
IsUpdateComplete() const535 bool CQualLookupMap::IsUpdateComplete() const
536 {
537     TQualifierRequests::const_iterator rq_it = m_Map.cbegin();
538     while (rq_it != m_Map.cend()) {
539         if (rq_it->second->NumRemainingReplies() > 0) {
540             return false;
541             break;
542         }
543         ++rq_it;
544     }
545     return true;
546 }
547 //LCOV_EXCL_STOP
548 
549 
PostErrors(CValidError_imp & imp)550 void CQualLookupMap::PostErrors(CValidError_imp& imp)
551 {
552     TQualifierRequests::iterator rq_it = m_Map.begin();
553     while (rq_it != m_Map.end()) {
554         rq_it->second->PostErrors(imp);
555         ++rq_it;
556     }
557 }
558 
559 
560 //LCOV_EXCL_START
561 //only used by biosample
ListErrors(vector<TTaxError> & errs) const562 void CQualLookupMap::ListErrors(vector<TTaxError>& errs) const
563 {
564     for (auto rq_it : m_Map) {
565         rq_it.second->ListErrors(errs);
566     }
567 }
568 
569 
570 //LCOV_EXCL_STOP
571 
572 
x_MakeNewRequest(const string & orig_val,const COrg_ref & org)573 CRef<CQualifierRequest> CSpecificHostMap::x_MakeNewRequest(const string& orig_val, const COrg_ref& org)
574 {
575     CRef<CQualifierRequest> rq(new CSpecificHostRequest(orig_val, org));
576     return rq;
577 }
578 
579 
580 //LCOV_EXCL_START
581 //used for cleanup
x_MakeNewRequest(const string & orig_val,const COrg_ref & org)582 CRef<CQualifierRequest> CSpecificHostMapForFix::x_MakeNewRequest(const string& orig_val, const COrg_ref& org)
583 {
584     CRef<CQualifierRequest> rq(new CSpecificHostRequest(orig_val, org, true));
585     return rq;
586 }
587 
588 
// Normalize a host value: trim surrounding whitespace from a copy, then
// pass it through COrgMod::FixHost() and return the result.
string CSpecificHostMapForFix::x_DefaultSpecificHostAdjustments(const string& host_val)
{
    string trimmed = host_val;
    NStr::TruncateSpacesInPlace(trimmed);
    return COrgMod::FixHost(trimmed);
}
596 
597 
ApplyToOrg(COrg_ref & org_ref) const598 bool CSpecificHostMapForFix::ApplyToOrg(COrg_ref& org_ref) const
599 {
600     if (!org_ref.IsSetOrgname() ||
601         !org_ref.GetOrgname().IsSetMod()) {
602         return false;
603     }
604 
605     bool changed = false;
606 
607     for (auto m = org_ref.SetOrgname().SetMod().begin(); m != org_ref.SetOrgname().SetMod().end(); m++) {
608         if ((*m)->IsSetSubtype() &&
609             (*m)->GetSubtype() == COrgMod::eSubtype_nat_host &&
610             (*m)->IsSetSubname()) {
611             string host_val = x_DefaultSpecificHostAdjustments((*m)->GetSubname());
612             TQualifierRequests::const_iterator it = m_Map.find(host_val);
613             if (it != m_Map.end()) {
614                 const CSpecificHostRequest* rq = dynamic_cast<const CSpecificHostRequest *>(it->second.GetPointer());
615                 string new_val = x_DefaultSpecificHostAdjustments(rq->SuggestFix());
616                 if (!NStr::IsBlank(new_val) && !NStr::Equal(new_val, (*m)->GetSubname())) {
617                     (*m)->SetSubname(new_val);
618                     changed = true;
619                 }
620             }
621         }
622     }
623 
624     return changed;
625 }
626 //LCOV_EXCL_STOP
627 
628 
x_MakeNewRequest(const string & orig_val,const COrg_ref & org)629 CRef<CQualifierRequest> CStrainMap::x_MakeNewRequest(const string& orig_val, const COrg_ref& org)
630 {
631     CRef<CQualifierRequest> rq(new CStrainRequest(orig_val, org));
632     return rq;
633 }
634 
635 
CTaxValidationAndCleanup()636 CTaxValidationAndCleanup::CTaxValidationAndCleanup()
637 {
638     m_SrcDescs.clear();
639     m_DescCtxs.clear();
640     m_SrcFeats.clear();
641     m_SpecificHostRequests.clear();
642     m_SpecificHostRequestsBuilt = false;
643     m_SpecificHostRequestsUpdated = false;
644     m_StrainRequestsBuilt = false;
645 }
646 
647 
Init(const CSeq_entry & se)648 void CTaxValidationAndCleanup::Init(const CSeq_entry& se)
649 {
650     m_SrcDescs.clear();
651     m_DescCtxs.clear();
652     m_SrcFeats.clear();
653     m_SpecificHostRequests.clear();
654     m_SpecificHostRequestsBuilt = false;
655     m_SpecificHostRequestsUpdated = false;
656     m_StrainRequestsBuilt = false;
657     x_GatherSources(se);
658 }
659 
660 
GetTopReportObject() const661 CConstRef<CSeq_entry> CTaxValidationAndCleanup::GetTopReportObject() const
662 {
663     if (!m_DescCtxs.empty()) {
664         return m_DescCtxs.front();
665     } else {
666         return CConstRef<CSeq_entry>(NULL);
667     }
668 }
669 
670 
x_GatherSources(const CSeq_entry & se)671 void CTaxValidationAndCleanup::x_GatherSources(const CSeq_entry& se)
672 {
673     // get source descriptors
674     FOR_EACH_DESCRIPTOR_ON_SEQENTRY(it, se)
675     {
676         if ((*it)->IsSource() && (*it)->GetSource().IsSetOrg()) {
677             CConstRef<CSeqdesc> desc;
678             desc.Reset(*it);
679             m_SrcDescs.push_back(desc);
680             CConstRef<CSeq_entry> r_se;
681             r_se.Reset(&se);
682             m_DescCtxs.push_back(r_se);
683         }
684     }
685     // also get features
686     FOR_EACH_ANNOT_ON_SEQENTRY(annot_it, se)
687     {
688         FOR_EACH_SEQFEAT_ON_SEQANNOT(feat_it, **annot_it)
689         {
690             if ((*feat_it)->IsSetData() && (*feat_it)->GetData().IsBiosrc()
691                 && (*feat_it)->GetData().GetBiosrc().IsSetOrg()) {
692                 CConstRef<CSeq_feat> feat;
693                 feat.Reset(*feat_it);
694                 m_SrcFeats.push_back(feat);
695             }
696         }
697     }
698 
699     // if set, recurse
700     if (se.IsSet()) {
701         FOR_EACH_SEQENTRY_ON_SEQSET(it, se.GetSet())
702         {
703             x_GatherSources(**it);
704         }
705     }
706 }
707 
708 
GetTaxonomyLookupRequest() const709 vector< CRef<COrg_ref> > CTaxValidationAndCleanup::GetTaxonomyLookupRequest() const
710 {
711     // request list for taxon3
712     vector< CRef<COrg_ref> > org_rq_list;
713 
714     // first do descriptors
715     vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.cbegin();
716     vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.cbegin();
717     while (desc_it != m_SrcDescs.cend() && ctx_it != m_DescCtxs.cend()) {
718         CRef<COrg_ref> rq(new COrg_ref);
719         const COrg_ref& org = (*desc_it)->GetSource().GetOrg();
720         rq->Assign(org);
721         org_rq_list.push_back(rq);
722 
723         ++desc_it;
724         ++ctx_it;
725     }
726 
727     // now do features
728     vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.cbegin();
729     while (feat_it != m_SrcFeats.cend()) {
730         CRef<COrg_ref> rq(new COrg_ref);
731         const COrg_ref& org = (*feat_it)->GetData().GetBiosrc().GetOrg();
732         rq->Assign(org);
733         org_rq_list.push_back(rq);
734 
735         ++feat_it;
736     }
737     return org_rq_list;
738 }
739 
740 
x_InterpretTaxonomyError(const CT3Error & error,const COrg_ref & org,const EErrType type,vector<TTaxError> & errs) const741 void CTaxValidationAndCleanup::x_InterpretTaxonomyError(const CT3Error& error, const COrg_ref& org, const EErrType type, vector<TTaxError>& errs) const
742 {
743     const string err_str = error.IsSetMessage() ? error.GetMessage() : "?";
744 
745     if (NStr::Equal(err_str, "Organism not found")) {
746         string msg = "Organism not found in taxonomy database";
747         if (error.IsSetOrg() && error.GetOrg().IsSetTaxname() &&
748             !NStr::Equal(error.GetOrg().GetTaxname(), "Not valid") &&
749             (!org.IsSetTaxname() ||
750                 !NStr::Equal(org.GetTaxname(), error.GetOrg().GetTaxname()))) {
751             msg += " (suggested:" + error.GetOrg().GetTaxname() + ")";
752         }
753         errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_OrganismNotFound, msg });
754     } else if (NStr::StartsWith(err_str, "Organism not found. Possible matches")) {
755         errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_OrganismNotFound, err_str });
756     } else if (NStr::Equal(err_str, kInvalidReplyMsg)) {
757         errs.push_back(TTaxError{ eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, err_str });
758     } else if (NStr::Find(err_str, "ambiguous name") != NPOS) {
759         errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_TaxonomyAmbiguousName,
760             "Taxonomy lookup failed with message '" + err_str + "'"});
761     } else {
762         errs.push_back(TTaxError{ eDiag_Warning, type,
763             "Taxonomy lookup failed with message '" + err_str + "'" });
764     }
765 }
766 
767 
ListTaxLookupErrors(const CT3Reply & reply,const COrg_ref & org,CBioSource::TGenome genome,bool is_insd_patent,bool is_wp,vector<TTaxError> & errs) const768 void CTaxValidationAndCleanup::ListTaxLookupErrors
769 (const CT3Reply& reply, const COrg_ref& org, CBioSource::TGenome genome, bool is_insd_patent, bool is_wp, vector<TTaxError>& errs) const
770 {
771     if (reply.IsError()) {
772         x_InterpretTaxonomyError(reply.GetError(), org, eErr_SEQ_DESCR_TaxonomyLookupProblem, errs);
773     } else if (reply.IsData()) {
774         bool is_species_level = true;
775         bool is_unidentified = false;
776         bool force_consult = false;
777         bool has_nucleomorphs = false;
778         if (reply.GetData().IsSetOrg()) {
779             const COrg_ref& orp_rep = reply.GetData().GetOrg();
780             if (org.IsSetTaxname() && orp_rep.IsSetTaxname()) {
781                 const string& taxname_req = org.GetTaxname();
782                 const string& taxname_rep = orp_rep.GetTaxname();
783                 if (NStr::Equal(taxname_rep, "unidentified")) {
784                     is_unidentified = true;
785                 }
786                 TTaxId taxid_request = org.GetTaxId();
787                 TTaxId taxid_reply = orp_rep.GetTaxId();
788 
789                 if (taxid_request != ZERO_TAX_ID && taxid_reply != ZERO_TAX_ID && taxid_request != taxid_reply) {
790                     errs.push_back(TTaxError{ eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem,
791                         "Organism name is '" + taxname_req
792                         + "', taxonomy ID should be '" + NStr::NumericToString(taxid_reply)
793                         + "' but is '" + NStr::NumericToString(taxid_request) + "'" });
794                 }
795             }
796         }
797         reply.GetData().GetTaxFlags(is_species_level, force_consult, has_nucleomorphs);
798         if (!is_species_level && !is_wp) {
799             errs.push_back(TTaxError{ eDiag_Warning, eErr_SEQ_DESCR_TaxonomyIsSpeciesProblem,
800                 "Taxonomy lookup reports is_species_level FALSE"});
801         }
802         if (force_consult) {
803             if (is_insd_patent && is_unidentified) {
804                 force_consult = false;
805             }
806         }
807         if (force_consult) {
808             errs.push_back(TTaxError{eDiag_Warning, eErr_SEQ_DESCR_TaxonomyConsultRequired,
809                 "Taxonomy lookup reports taxonomy consultation needed"});
810         }
811         if (genome == CBioSource::eGenome_nucleomorph
812             && !has_nucleomorphs) {
813             errs.push_back(TTaxError{eDiag_Warning, eErr_SEQ_DESCR_TaxonomyNucleomorphProblem,
814                     "Taxonomy lookup does not have expected nucleomorph flag"});
815         } else if (genome == CBioSource::eGenome_plastid
816             && (!reply.GetData().HasPlastids())) {
817             errs.push_back(TTaxError{eDiag_Warning, eErr_SEQ_DESCR_TaxonomyPlastidsProblem,
818                     "Taxonomy lookup does not have expected plastid flag"});
819         }
820     }
821 
822 }
823 
ReportTaxLookupErrors(const CTaxon3_reply & reply,CValidError_imp & imp,bool is_insd_patent) const824 void CTaxValidationAndCleanup::ReportTaxLookupErrors
825 (const CTaxon3_reply& reply,
826  CValidError_imp& imp,
827  bool is_insd_patent) const
828 {
829     CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
830 
831     // process descriptor responses
832     vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.cbegin();
833     vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.cbegin();
834 
835     while (reply_it != reply.GetReply().end()
836         && desc_it != m_SrcDescs.cend()
837         && ctx_it != m_DescCtxs.cend()) {
838         vector<TTaxError> errs;
839         const COrg_ref& orp_req = (*desc_it)->GetSource().GetOrg();
840         ListTaxLookupErrors(**reply_it, orp_req,
841             (*desc_it)->GetSource().IsSetGenome() ? (*desc_it)->GetSource().GetGenome() : CBioSource::eGenome_unknown,
842             is_insd_patent, imp.IsWP(), errs);
843         for (auto it : errs) {
844             imp.PostObjErr(it.severity, it.err_type, it.err_msg, **desc_it, *ctx_it);
845         }
846         ++reply_it;
847         ++desc_it;
848         ++ctx_it;
849     }
850     // process feat responses
851     vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.cbegin();
852     while (reply_it != reply.GetReply().cend()
853         && feat_it != m_SrcFeats.end()) {
854         vector<TTaxError> errs;
855         const COrg_ref& orp_req = (*feat_it)->GetData().GetBiosrc().GetOrg();
856         ListTaxLookupErrors(**reply_it, orp_req,
857             (*feat_it)->GetData().GetBiosrc().IsSetGenome() ? (*feat_it)->GetData().GetBiosrc().GetGenome() : CBioSource::eGenome_unknown,
858             is_insd_patent, imp.IsWP(), errs);
859         for (auto it : errs) {
860             imp.PostErr(it.severity, it.err_type, it.err_msg,* *feat_it);
861         }
862         ++reply_it;
863         ++feat_it;
864     }
865 
866 }
867 
868 
ReportIncrementalTaxLookupErrors(const CTaxon3_reply & reply,CValidError_imp & imp,bool is_insd_patent,size_t offset) const869 void CTaxValidationAndCleanup::ReportIncrementalTaxLookupErrors
870 (const CTaxon3_reply& reply,
871     CValidError_imp& imp,
872     bool is_insd_patent,
873     size_t offset) const
874 {
875     // cout << MSerial_AsnText << reply << endl;
876 
877     CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
878 
879     // process descriptor responses
880     vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.cbegin();
881     vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.cbegin();
882 
883     size_t skipped = 0;
884     while (skipped < offset
885         && desc_it != m_SrcDescs.cend()
886         && ctx_it != m_DescCtxs.cend()) {
887         ++desc_it;
888         ++ctx_it;
889         skipped++;
890     }
891 
892     while (reply_it != reply.GetReply().end()
893         && desc_it != m_SrcDescs.cend()
894         && ctx_it != m_DescCtxs.cend()) {
895         vector<TTaxError> errs;
896         const COrg_ref& orp_req = (*desc_it)->GetSource().GetOrg();
897         ListTaxLookupErrors(**reply_it, orp_req,
898             (*desc_it)->GetSource().IsSetGenome() ? (*desc_it)->GetSource().GetGenome() : CBioSource::eGenome_unknown,
899             is_insd_patent, imp.IsWP(), errs);
900         for (auto it : errs) {
901             imp.PostObjErr(it.severity, it.err_type, it.err_msg, **desc_it, *ctx_it);
902         }
903         ++reply_it;
904         ++desc_it;
905         ++ctx_it;
906     }
907 
908     if (reply_it == reply.GetReply().end()) {
909         return;
910     }
911     // process feat responses
912     vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.cbegin();
913     while (skipped < offset && feat_it != m_SrcFeats.end()) {
914         ++feat_it;
915         skipped++;
916     }
917     while (reply_it != reply.GetReply().cend() &&
918         feat_it != m_SrcFeats.end()) {
919         vector<TTaxError> errs;
920         const COrg_ref& orp_req = (*feat_it)->GetData().GetBiosrc().GetOrg();
921         ListTaxLookupErrors(**reply_it, orp_req,
922             (*feat_it)->GetData().GetBiosrc().IsSetGenome() ? (*feat_it)->GetData().GetBiosrc().GetGenome() : CBioSource::eGenome_unknown,
923             is_insd_patent, imp.IsWP(), errs);
924         for (auto it : errs) {
925             imp.PostErr(it.severity, it.err_type, it.err_msg, **feat_it);
926         }
927         ++reply_it;
928         ++feat_it;
929     }
930 
931 
932 }
933 
934 
935 
936 //LCOV_EXCL_START
937 //used by Genome Workbench
AdjustOrgRefsWithTaxLookupReply(const CTaxon3_reply & reply,vector<CRef<COrg_ref>> org_refs,string & error_message,bool use_error_orgrefs) const938 bool CTaxValidationAndCleanup::AdjustOrgRefsWithTaxLookupReply
939 ( const CTaxon3_reply& reply,
940  vector<CRef<COrg_ref> > org_refs,
941  string& error_message,
942  bool use_error_orgrefs) const
943 {
944     bool changed = false;
945     CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
946     vector<CRef<COrg_ref> >::iterator org_it = org_refs.begin();
947     while (reply_it != reply.GetReply().end() && org_it != org_refs.end()) {
948         CRef<COrg_ref> cpy(NULL);
949         if ((*reply_it)->IsData() &&
950             (*reply_it)->GetData().IsSetOrg()) {
951             cpy.Reset(new COrg_ref());
952             cpy->Assign((*reply_it)->GetData().GetOrg());
953         } else if (use_error_orgrefs &&
954             (*reply_it)->IsError() &&
955             (*reply_it)->GetError().IsSetOrg() &&
956             (*reply_it)->GetError().GetOrg().IsSetTaxname() &&
957             !NStr::Equal((*reply_it)->GetError().GetOrg().GetTaxname(), "Not valid")) {
958             cpy.Reset(new COrg_ref());
959             cpy->Assign((*reply_it)->GetError().GetOrg());
960         }
961         if (cpy) {
962             cpy->CleanForGenBank();
963             if (!cpy->Equals(**org_it)) {
964                 (*org_it)->Assign(*cpy);
965                 changed = true;
966             }
967         }
968         ++reply_it;
969         ++org_it;
970     }
971     if (reply_it != reply.GetReply().end()) {
972         error_message = "More taxonomy replies than requests!";
973     } else if (org_it != org_refs.end()) {
974         error_message = "Not enough taxonomy replies!";
975     }
976     return changed;
977 }
978 //LCOV_EXCL_STOP
979 
980 
GetSpecificHostLookupRequest(bool for_fix)981 vector<CRef<COrg_ref> > CTaxValidationAndCleanup::GetSpecificHostLookupRequest(bool for_fix)
982 {
983     if (for_fix) {
984         if (!m_HostMapForFix.IsPopulated()) {
985             x_CreateQualifierMap(m_HostMapForFix);
986         }
987         return m_HostMapForFix.GetRequestList();
988     } else {
989         if (!m_HostMap.IsPopulated()) {
990             x_CreateQualifierMap(m_HostMap);
991         }
992         return m_HostMap.GetRequestList();
993     }
994 }
995 
GetStrainLookupRequest()996 vector<CRef<COrg_ref> > CTaxValidationAndCleanup::GetStrainLookupRequest()
997 {
998     if (!m_StrainRequestsBuilt) {
999         x_CreateStrainMap();
1000     }
1001 
1002     vector<CRef<COrg_ref> > org_rq_list = m_StrainMap.GetRequestList();
1003     return org_rq_list;
1004 }
1005 
1006 
x_CreateQualifierMap(CQualLookupMap & lookup)1007 void CTaxValidationAndCleanup::x_CreateQualifierMap(CQualLookupMap& lookup)
1008 {
1009     //first do descriptors
1010     vector<CConstRef<CSeqdesc> >::const_iterator desc_it = m_SrcDescs.begin();
1011     vector<CConstRef<CSeq_entry> >::const_iterator ctx_it = m_DescCtxs.begin();
1012     while (desc_it != m_SrcDescs.end() && ctx_it != m_DescCtxs.end()) {
1013         lookup.AddDesc(*desc_it, *ctx_it);
1014         ++desc_it;
1015         ++ctx_it;
1016     }
1017     // collect features with specific hosts
1018     vector<CConstRef<CSeq_feat> >::const_iterator feat_it = m_SrcFeats.begin();
1019     while (feat_it != m_SrcFeats.end()) {
1020         lookup.AddFeat(*feat_it);
1021         ++feat_it;
1022     }
1023 
1024 }
1025 
1026 
x_CreateStrainMap()1027 void CTaxValidationAndCleanup::x_CreateStrainMap()
1028 {
1029     x_CreateQualifierMap(m_StrainMap);
1030     m_StrainRequestsBuilt = true;
1031 }
1032 
1033 
ReportSpecificHostErrors(CValidError_imp & imp)1034 void CTaxValidationAndCleanup::ReportSpecificHostErrors(CValidError_imp& imp)
1035 {
1036     m_HostMap.PostErrors(imp);
1037 }
1038 
1039 //LCOV_EXCL_START
1040 //appears to not be used
ReportSpecificHostErrors(const CTaxon3_reply & reply,CValidError_imp & imp)1041 void CTaxValidationAndCleanup::ReportSpecificHostErrors(const CTaxon3_reply& reply, CValidError_imp& imp)
1042 {
1043     string error_message;
1044     if (!m_HostMap.IsUpdateComplete()) {
1045         vector<CRef<COrg_ref> > input = m_HostMap.GetRequestList();
1046         error_message = m_HostMap.IncrementalUpdate(input, reply);
1047     }
1048     if (!NStr::IsBlank(error_message)) {
1049         imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, error_message, *(GetTopReportObject()));
1050         return;
1051     }
1052 
1053     m_HostMap.PostErrors(imp);
1054 }
1055 //LCOV_EXCL_STOP
1056 
1057 
1058 //LCOV_EXCL_START
1059 //only used by cleanup
AdjustOrgRefsWithSpecificHostReply(vector<CRef<COrg_ref>> requests,const CTaxon3_reply & reply,vector<CRef<COrg_ref>> org_refs,string & error_message)1060 bool CTaxValidationAndCleanup::AdjustOrgRefsWithSpecificHostReply
1061 (vector<CRef<COrg_ref> > requests,
1062  const CTaxon3_reply& reply,
1063  vector<CRef<COrg_ref> > org_refs,
1064  string& error_message)
1065 {
1066     if (!m_HostMapForFix.IsUpdateComplete()) {
1067         // need to calculate requests for this list
1068         m_HostMapForFix.IncrementalUpdate(requests, reply);
1069     }
1070     return AdjustOrgRefsForSpecificHosts(org_refs);
1071 }
1072 
1073 
AdjustOrgRefsForSpecificHosts(vector<CRef<COrg_ref>> org_refs)1074 bool CTaxValidationAndCleanup::AdjustOrgRefsForSpecificHosts(vector<CRef<COrg_ref> > org_refs)
1075 {
1076     bool changed = false;
1077     for (auto org = org_refs.begin(); org != org_refs.end(); org++) {
1078         changed |= m_HostMapForFix.ApplyToOrg(**org);
1079     }
1080     return changed;
1081 }
1082 
1083 
x_FindHostFixRequest(const string & val)1084 TSpecificHostRequests::iterator CTaxValidationAndCleanup::x_FindHostFixRequest(const string& val)
1085 {
1086     TSpecificHostRequests::iterator map_it = m_SpecificHostRequests.find(val);
1087     if (map_it != m_SpecificHostRequests.end() && map_it->second.NumRemainingReplies() > 0) {
1088         return map_it;
1089     }
1090     map_it = m_SpecificHostRequests.begin();
1091     while (map_it != m_SpecificHostRequests.end()) {
1092         if (map_it->second.MatchTryValue(val) && map_it->second.NumRemainingReplies() > 0) {
1093             return map_it;
1094         }
1095         ++map_it;
1096     }
1097     return m_SpecificHostRequests.end();
1098 }
1099 //LCOV_EXCL_STOP
1100 
1101 
IncrementalSpecificHostMapUpdate(const vector<CRef<COrg_ref>> & input,const CTaxon3_reply & reply)1102 string CTaxValidationAndCleanup::IncrementalSpecificHostMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply)
1103 {
1104     string error_message;
1105     if (m_HostMap.IsPopulated()) {
1106         error_message = m_HostMap.IncrementalUpdate(input, reply);
1107     }
1108     if (NStr::IsBlank(error_message)) {
1109         if (m_HostMapForFix.IsPopulated()) {
1110             error_message = m_HostMapForFix.IncrementalUpdate(input, reply);
1111         }
1112     }
1113     return error_message;
1114 }
1115 
1116 
1117 //LCOV_EXCL_START
1118 //used only by cleanup
IsSpecificHostMapUpdateComplete() const1119 bool CTaxValidationAndCleanup::IsSpecificHostMapUpdateComplete() const
1120 {
1121     if (m_HostMap.IsPopulated()) {
1122         return m_HostMap.IsUpdateComplete();
1123     } else if (m_HostMapForFix.IsPopulated()) {
1124         return m_HostMapForFix.IsUpdateComplete();
1125     } else {
1126         return false;
1127     }
1128 }
1129 
1130 
x_UpdateSpecificHostMapWithReply(const CTaxon3_reply & reply,string & error_message)1131 void CTaxValidationAndCleanup::x_UpdateSpecificHostMapWithReply(const CTaxon3_reply& reply, string& error_message)
1132 {
1133     CTaxon3_reply::TReply::const_iterator reply_it = reply.GetReply().begin();
1134     TSpecificHostRequests::iterator rq_it = m_SpecificHostRequests.begin();
1135     while (rq_it != m_SpecificHostRequests.end()) {
1136         while (rq_it->second.NumRemainingReplies() > 0 && reply_it != reply.GetReply().end()) {
1137             rq_it->second.AddReply(**reply_it);
1138             ++reply_it;
1139         }
1140         if (rq_it->second.NumRemainingReplies() > 0) {
1141             error_message = "Failed to respond to all taxonomy requests for specific host";
1142             break;
1143         }
1144         ++rq_it;
1145     }
1146 
1147     if (reply_it != reply.GetReply().end()) {
1148         error_message = "Unexpected taxonomy responses for specific host";
1149     }
1150 }
1151 
1152 
x_ApplySpecificHostMap(COrg_ref & org_ref) const1153 bool CTaxValidationAndCleanup::x_ApplySpecificHostMap(COrg_ref& org_ref) const
1154 {
1155     if (!org_ref.IsSetOrgname() ||
1156         !org_ref.GetOrgname().IsSetMod()) {
1157         return false;
1158     }
1159 
1160     bool changed = false;
1161 
1162     for (auto m = org_ref.SetOrgname().SetMod().begin(); m != org_ref.SetOrgname().SetMod().end(); m++) {
1163         if ((*m)->IsSetSubtype() &&
1164             (*m)->GetSubtype() == COrgMod::eSubtype_nat_host &&
1165             (*m)->IsSetSubname()) {
1166             string host_val = x_DefaultSpecificHostAdjustments((*m)->GetSubname());
1167             TSpecificHostRequests::const_iterator it = m_SpecificHostRequests.find(host_val);
1168             if (it != m_SpecificHostRequests.end()) {
1169                 const string& new_val = it->second.SuggestFix();
1170                 if (!NStr::IsBlank(new_val) && !NStr::Equal(new_val, (*m)->GetSubname())) {
1171                     (*m)->SetSubname(new_val);
1172                     changed = true;
1173                 }
1174             }
1175         }
1176     }
1177 
1178     return changed;
1179 }
1180 
1181 
// Normalizes a specific-host value: strips surrounding whitespace with
// NStr::TruncateSpacesInPlace, then passes the result through
// COrgMod::FixHost. The input string is not modified.
string CTaxValidationAndCleanup::x_DefaultSpecificHostAdjustments(const string& host_val)
{
    string trimmed(host_val);
    NStr::TruncateSpacesInPlace(trimmed);
    return COrgMod::FixHost(trimmed);
}
1189 
1190 
IncrementalStrainMapUpdate(const vector<CRef<COrg_ref>> & input,const CTaxon3_reply & reply)1191 string CTaxValidationAndCleanup::IncrementalStrainMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply)
1192 {
1193     return m_StrainMap.IncrementalUpdate(input, reply);
1194 }
1195 
1196 
IsStrainMapUpdateComplete() const1197 bool CTaxValidationAndCleanup::IsStrainMapUpdateComplete() const
1198 {
1199     return m_StrainMap.IsUpdateComplete();
1200 }
1201 //LCOV_EXCL_STOP
1202 
1203 
ReportStrainErrors(CValidError_imp & imp)1204 void CTaxValidationAndCleanup::ReportStrainErrors(CValidError_imp& imp)
1205 {
1206     m_StrainMap.PostErrors(imp);
1207 }
1208 
GetSeqContext(size_t num) const1209 CConstRef<CSeq_entry> CTaxValidationAndCleanup::GetSeqContext(size_t num) const
1210 {
1211     return (num < m_DescCtxs.size()) ? m_DescCtxs[num] : CConstRef<CSeq_entry>();
1212 }
1213 
1214 
1215 //LCOV_EXCL_START
1216 //used by Genome Workbench, asn_cleanup, and table2asn but not asnvalidate
DoTaxonomyUpdate(CSeq_entry_Handle seh,bool with_host)1217 bool CTaxValidationAndCleanup::DoTaxonomyUpdate(CSeq_entry_Handle seh, bool with_host)
1218 {
1219     Init(*(seh.GetCompleteSeq_entry()));
1220 
1221     vector<CRef<COrg_ref> > original_orgs = GetTaxonomyLookupRequest();
1222     if (original_orgs.empty())
1223     {
1224         return false;
1225     }
1226     const size_t chunk_size = 1000;
1227     vector< CRef<COrg_ref> > edited_orgs;
1228 
1229     CTaxon3 taxon3;
1230     taxon3.Init();
1231     size_t i = 0;
1232     while (i < original_orgs.size())
1233     {
1234         size_t len = min(chunk_size, original_orgs.size() - i);
1235         vector< CRef<COrg_ref> >  tmp_original_orgs(original_orgs.begin() + i, original_orgs.begin() + i + len);
1236         vector< CRef<COrg_ref> >  tmp_edited_orgs;
1237         ITERATE(vector<CRef<COrg_ref> >, it, tmp_original_orgs)
1238         {
1239             CRef<COrg_ref> cpy(new COrg_ref());
1240             cpy->Assign(**it);
1241             tmp_edited_orgs.push_back(cpy);
1242         }
1243         CRef<CTaxon3_reply> tmp_lookup_reply = taxon3.SendOrgRefList(tmp_original_orgs);
1244         string error_message;
1245         AdjustOrgRefsWithTaxLookupReply(*tmp_lookup_reply, tmp_edited_orgs, error_message);
1246         if (!NStr::IsBlank(error_message))
1247         {
1248             // post error message
1249             ERR_POST(Error << error_message);
1250             return false;
1251         }
1252         edited_orgs.insert(edited_orgs.end(), tmp_edited_orgs.begin(), tmp_edited_orgs.end());
1253         i += len;
1254     }
1255 
1256     if (with_host) {
1257         vector< CRef<COrg_ref> > spec_host_rq = GetSpecificHostLookupRequest(true);
1258         i = 0;
1259         while (i < spec_host_rq.size())
1260         {
1261             size_t len = min(chunk_size, spec_host_rq.size() - i);
1262             vector< CRef<COrg_ref> > tmp_spec_host_rq(spec_host_rq.begin() + i, spec_host_rq.begin() + i + len);
1263             CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(tmp_spec_host_rq);
1264             string error_message = IncrementalSpecificHostMapUpdate(tmp_spec_host_rq, *tmp_spec_host_reply);
1265             if (!NStr::IsBlank(error_message))
1266             {
1267                 // post error message
1268                 ERR_POST(Error << error_message);
1269                 return false;
1270             }
1271             i += len;
1272         }
1273 
1274         AdjustOrgRefsForSpecificHosts(edited_orgs);
1275     }
1276 
1277     // update descriptors
1278     size_t num_descs = NumDescs();
1279     size_t num_updated_descs = 0;
1280     for (size_t n = 0; n < num_descs; n++) {
1281         if (!original_orgs[n]->Equals(*(edited_orgs[n]))) {
1282             CSeqdesc* orig = const_cast<CSeqdesc *>(GetDesc(n).GetPointer());
1283             orig->SetSource().SetOrg().Assign(*(edited_orgs[n]));
1284             num_updated_descs++;
1285         }
1286     }
1287 
1288     // now update features
1289     size_t num_updated_feats = 0;
1290     for (size_t n = 0; n < NumFeats(); n++) {
1291         if (!original_orgs[n + num_descs]->Equals(*edited_orgs[n + num_descs])) {
1292             CConstRef<CSeq_feat> feat = GetFeat(n);
1293             CRef<CSeq_feat> new_feat(new CSeq_feat());
1294             new_feat->Assign(*feat);
1295             new_feat->SetData().SetBiosrc().SetOrg().Assign(*(edited_orgs[n + num_descs]));
1296 
1297             CSeq_feat_Handle fh = seh.GetScope().GetSeq_featHandle(*feat);
1298             CSeq_feat_EditHandle efh(fh);
1299             efh.Replace(*new_feat);
1300             num_updated_feats++;
1301         }
1302     }
1303     return (num_updated_descs > 0 || num_updated_feats > 0);
1304 }
1305 //LCOV_EXCL_STOP
1306 
1307 
1308 //LCOV_EXCL_START
1309 //only used by biosample
FixOneSpecificHost(string & val)1310 void CTaxValidationAndCleanup::FixOneSpecificHost(string& val)
1311 {
1312     val = x_DefaultSpecificHostAdjustments(val);
1313     string err_msg;
1314     if(IsOneSpecificHostValid(val, err_msg)) {
1315         return;
1316     }
1317     m_HostMapForFix.Clear();
1318     m_HostMapForFix.AddString(val);
1319 
1320     vector< CRef<COrg_ref> > spec_host_rq = m_HostMapForFix.GetRequestList();
1321     if (spec_host_rq.empty()) {
1322         m_HostMapForFix.Clear();
1323         return;
1324     }
1325     vector< CRef<COrg_ref> > edited;
1326     edited.push_back(CRef<COrg_ref>(new COrg_ref()));
1327     edited.front()->SetOrgname().SetMod().push_back(CRef<COrgMod>(new COrgMod(COrgMod::eSubtype_nat_host, val)));
1328 
1329     CTaxon3 taxon3;
1330     taxon3.Init();
1331     CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(spec_host_rq);
1332 
1333     if (!tmp_spec_host_reply->IsSetReply() || !tmp_spec_host_reply->GetReply().front()->IsData()) {
1334         val = kEmptyStr;
1335         m_HostMapForFix.Clear();
1336         return;
1337     }
1338 
1339     string error_message = IncrementalSpecificHostMapUpdate(spec_host_rq, *tmp_spec_host_reply);
1340     if (!NStr::IsBlank(error_message))
1341     {
1342         // post error message
1343         ERR_POST(Error << error_message);
1344     }
1345 
1346 
1347     AdjustOrgRefsForSpecificHosts(edited);
1348 
1349     val = edited.front()->GetOrgname().GetMod().front()->GetSubname();
1350     m_HostMapForFix.Clear();
1351 }
1352 //LCOV_EXCL_STOP
1353 
1354 
1355 //LCOV_EXCL_START
1356 //only used by biosample
IsOneSpecificHostValid(const string & val,string & error_msg)1357 bool CTaxValidationAndCleanup::IsOneSpecificHostValid(const string& val, string& error_msg)
1358 {
1359     error_msg = kEmptyStr;
1360     m_HostMap.Clear();
1361 
1362     m_HostMap.AddString(val);
1363 
1364     vector< CRef<COrg_ref> > spec_host_rq = m_HostMap.GetRequestList();
1365     if (spec_host_rq.empty()) {
1366         m_HostMap.Clear();
1367         return true;
1368     }
1369 
1370     CTaxon3 taxon3;
1371     taxon3.Init();
1372     CRef<CTaxon3_reply> tmp_spec_host_reply = taxon3.SendOrgRefList(spec_host_rq);
1373 
1374     string err_msg;
1375     if (tmp_spec_host_reply) {
1376         err_msg = IncrementalSpecificHostMapUpdate(spec_host_rq, *tmp_spec_host_reply);
1377     } else {
1378         err_msg = "Connection to taxonomy failed";
1379     }
1380     bool rval = true;
1381     error_msg = err_msg;
1382 
1383     if (!NStr::IsBlank(err_msg)) {
1384         ERR_POST(Error << err_msg);
1385         m_HostMap.Clear();
1386         rval = false;
1387     } else {
1388         vector<TTaxError> errs;
1389         m_HostMap.ListErrors(errs);
1390         if (errs.size() > 0) {
1391             error_msg = errs.front().err_msg;
1392             rval = false;
1393         }
1394     }
1395     m_HostMap.Clear();
1396     return rval;
1397 }
1398 //LCOV_EXCL_STOP
1399 
1400 
CheckOneOrg(const COrg_ref & org,int genome,CValidError_imp & imp)1401 void CTaxValidationAndCleanup::CheckOneOrg(const COrg_ref& org, int genome, CValidError_imp& imp)
1402 {
1403     x_ClearMaps();
1404 
1405     vector<TTaxError> errs;
1406     CTaxon3 taxon3;
1407     taxon3.Init();
1408 
1409     // lookup of whole org
1410     vector< CRef<COrg_ref> > org_rq_list;
1411     CRef<COrg_ref> rq(new COrg_ref);
1412     rq->Assign(org);
1413     org_rq_list.push_back(rq);
1414 
1415     CRef<CTaxon3_reply> reply = taxon3.SendOrgRefList(org_rq_list);
1416 
1417     if (!reply || !reply->IsSetReply()) {
1418         imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyServiceProblem,
1419                 "Taxonomy service connection failure", org);
1420     } else {
1421         ListTaxLookupErrors(*(reply->GetReply().front()), org, genome,
1422             false, false, errs);
1423     }
1424 
1425     // Now look at specific-host values
1426     m_HostMap.AddOrg(org);
1427     org_rq_list = GetSpecificHostLookupRequest(false);
1428 
1429     if (!org_rq_list.empty()) {
1430         reply = taxon3.SendOrgRefList(org_rq_list);
1431         string err_msg;
1432         if (reply) {
1433             err_msg = IncrementalSpecificHostMapUpdate(org_rq_list, *reply);
1434         } else {
1435             err_msg = "Connection to taxonomy failed";
1436         }
1437         if (!NStr::IsBlank(err_msg)) {
1438             imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, err_msg, org);
1439         } else {
1440             m_HostMap.ListErrors(errs);
1441         }
1442     }
1443 
1444 
1445     // validate strain
1446     m_StrainMap.AddOrg(org);
1447     org_rq_list = GetStrainLookupRequest();
1448     if (!org_rq_list.empty()) {
1449         reply = taxon3.SendOrgRefList(org_rq_list);
1450         string err_msg = IncrementalStrainMapUpdate(org_rq_list, *reply);
1451         if (!NStr::IsBlank(err_msg)) {
1452             imp.PostErr(eDiag_Error, eErr_SEQ_DESCR_TaxonomyLookupProblem, err_msg, org);
1453         } else {
1454             m_StrainMap.ListErrors(errs);
1455         }
1456     }
1457 
1458     for (auto it : errs) {
1459         imp.PostObjErr(it.severity, it.err_type, it.err_msg, org);
1460     }
1461 }
1462 
1463 
1464 END_SCOPE(validator)
1465 END_SCOPE(objects)
1466 END_NCBI_SCOPE
1467